Add motion tracking functionality to VideoEditor

This commit introduces motion tracking capabilities, allowing users to add and remove tracking points on video frames. The tracking state is managed with new attributes, and the crop functionality is enhanced to follow the tracked motion. Additionally, the user interface is updated to reflect the tracking status, and keyboard shortcuts are added for toggling tracking and clearing points. This feature improves the editing experience by enabling dynamic cropping based on motion analysis.
This commit is contained in:
2025-09-17 01:14:26 +02:00
parent 66b23834fd
commit fdf7d98850

View File

@@ -601,6 +601,10 @@ class VideoEditor:
self.cached_frame_number = None
self.cached_transform_hash = None
# Motion tracking state
self.tracking_points = {} # {frame_number: [(x, y), ...]} in original frame coords
self.tracking_enabled = False
# Project view mode
self.project_view_mode = False
self.project_view = None
@@ -643,7 +647,9 @@ class VideoEditor:
'display_offset': self.display_offset,
'playback_speed': getattr(self, 'playback_speed', 1.0),
'seek_multiplier': getattr(self, 'seek_multiplier', 1.0),
'is_playing': getattr(self, 'is_playing', False)
'is_playing': getattr(self, 'is_playing', False),
'tracking_enabled': self.tracking_enabled,
'tracking_points': {str(k): v for k, v in self.tracking_points.items()}
}
with open(state_file, 'w') as f:
@@ -719,6 +725,12 @@ class VideoEditor:
if 'is_playing' in state:
self.is_playing = state['is_playing']
print(f"Loaded is_playing: {self.is_playing}")
if 'tracking_enabled' in state:
self.tracking_enabled = state['tracking_enabled']
print(f"Loaded tracking_enabled: {self.tracking_enabled}")
if 'tracking_points' in state and isinstance(state['tracking_points'], dict):
self.tracking_points = {int(k): v for k, v in state['tracking_points'].items()}
print(f"Loaded tracking_points: {sum(len(v) for v in self.tracking_points.values())} points")
# Validate cut markers against current video length
if self.cut_start_frame is not None and self.cut_start_frame >= self.total_frames:
@@ -1087,9 +1099,15 @@ class VideoEditor:
# Apply brightness/contrast first (to original frame for best quality)
processed_frame = self.apply_brightness_contrast(processed_frame)
# Apply crop
# Apply crop (with motion tracking follow if enabled)
if self.crop_rect:
x, y, w, h = self.crop_rect
if self.tracking_enabled:
interp = self._get_interpolated_tracking_position(getattr(self, 'current_frame', 0))
if interp:
cx, cy = interp
x = int(round(cx - w / 2))
y = int(round(cy - h / 2))
x, y, w, h = int(x), int(y), int(w), int(h)
# Ensure crop is within frame bounds
x = max(0, min(x, processed_frame.shape[1] - 1))
@@ -1129,6 +1147,135 @@ class VideoEditor:
return processed_frame
# --- Motion tracking helpers ---
def _get_effective_crop_rect_for_frame(self, frame_number):
"""Compute crop rect applied to a given frame, considering tracking follow."""
if not self.crop_rect:
return (0, 0, self.frame_width, self.frame_height)
x, y, w, h = map(int, self.crop_rect)
if self.tracking_enabled:
pos = self._get_interpolated_tracking_position(frame_number)
if pos:
cx, cy = pos
x = int(round(cx - w / 2))
y = int(round(cy - h / 2))
# Clamp to frame bounds
x = max(0, min(x, self.frame_width - 1))
y = max(0, min(y, self.frame_height - 1))
w = min(w, self.frame_width - x)
h = min(h, self.frame_height - y)
return (x, y, w, h)
def _get_interpolated_tracking_position(self, frame_number):
"""Linear interpolation between keyed tracking points.
Returns (x, y) in original frame coords or None.
"""
if not self.tracking_points:
return None
frames = sorted(self.tracking_points.keys())
if not frames:
return None
if frame_number in self.tracking_points and self.tracking_points[frame_number]:
pts = self.tracking_points[frame_number]
return (sum(p[0] for p in pts) / len(pts), sum(p[1] for p in pts) / len(pts))
if frame_number < frames[0]:
pts = self.tracking_points[frames[0]]
return (sum(p[0] for p in pts) / len(pts), sum(p[1] for p in pts) / len(pts)) if pts else None
if frame_number > frames[-1]:
pts = self.tracking_points[frames[-1]]
return (sum(p[0] for p in pts) / len(pts), sum(p[1] for p in pts) / len(pts)) if pts else None
for i in range(len(frames) - 1):
f1, f2 = frames[i], frames[i + 1]
if f1 <= frame_number <= f2:
pts1 = self.tracking_points.get(f1) or []
pts2 = self.tracking_points.get(f2) or []
if not pts1 or not pts2:
continue
x1 = sum(p[0] for p in pts1) / len(pts1)
y1 = sum(p[1] for p in pts1) / len(pts1)
x2 = sum(p[0] for p in pts2) / len(pts2)
y2 = sum(p[1] for p in pts2) / len(pts2)
t = (frame_number - f1) / (f2 - f1) if f2 != f1 else 0.0
return (x1 + t * (x2 - x1), y1 + t * (y2 - y1))
return None
    def _map_original_to_screen(self, ox, oy):
        """Map a point in original frame coords to canvas screen coords.

        Applies, in order: crop offset, rotation, zoom, then the
        downscale-only letterbox fit and centering used when drawing the
        frame onto the window canvas. Inverse of _map_screen_to_original.
        """
        # Effective crop for the current frame (includes tracking follow).
        cx, cy, cw, ch = self._get_effective_crop_rect_for_frame(getattr(self, 'current_frame', 0))
        # Translate into crop-local coordinates.
        px = ox - cx
        py = oy - cy
        angle = self.rotation_angle
        # 90/270 rotation swaps the displayed width and height.
        if angle in (90, 270):
            rotated_w, rotated_h = ch, cw
        else:
            rotated_w, rotated_h = cw, ch
        # NOTE(review): these rotation formulas must mirror the convention
        # used by apply_crop_zoom_and_rotation (not visible here) — verify
        # the 90/270 sign/swap choices against that implementation. They are
        # at least the exact inverses of _map_screen_to_original.
        if angle == 90:
            rx, ry = py, rotated_w - px
        elif angle == 180:
            rx, ry = rotated_w - px, rotated_h - py
        elif angle == 270:
            rx, ry = rotated_h - py, px
        else:
            rx, ry = px, py
        # Apply zoom.
        zx = rx * self.zoom_factor
        zy = ry * self.zoom_factor
        base_w, base_h = rotated_w, rotated_h
        disp_w = int(base_w * self.zoom_factor)
        disp_h = int(base_h * self.zoom_factor)
        # Canvas area available for the frame (timeline strip excluded for videos).
        available_height = self.window_height - (0 if self.is_image_mode else self.TIMELINE_HEIGHT)
        # Downscale-only fit: the frame is shrunk to fit the window but never enlarged.
        scale = min(self.window_width / max(1, disp_w), available_height / max(1, disp_h))
        if scale < 1.0:
            final_w = int(disp_w * scale)
            final_h = int(disp_h * scale)
        else:
            final_w = disp_w
            final_h = disp_h
            scale = 1.0
        # Centered placement (letterbox offsets).
        start_x = (self.window_width - final_w) // 2
        start_y = (available_height - final_h) // 2
        sx = int(round(start_x + zx * scale))
        sy = int(round(start_y + zy * scale))
        return sx, sy
    def _map_screen_to_original(self, sx, sy):
        """Map a point on canvas screen coords back to original frame coords.

        Inverse of _map_original_to_screen: undoes letterbox placement,
        fit scale, zoom, and rotation, then adds the crop offset and clamps
        the result inside the original frame bounds.
        """
        # Effective crop for the current frame (includes tracking follow).
        cx, cy, cw, ch = self._get_effective_crop_rect_for_frame(getattr(self, 'current_frame', 0))
        angle = self.rotation_angle
        # 90/270 rotation swaps the displayed width and height.
        if angle in (90, 270):
            rotated_w, rotated_h = ch, cw
        else:
            rotated_w, rotated_h = cw, ch
        disp_w = int(rotated_w * self.zoom_factor)
        disp_h = int(rotated_h * self.zoom_factor)
        # Canvas area available for the frame (timeline strip excluded for videos).
        available_height = self.window_height - (0 if self.is_image_mode else self.TIMELINE_HEIGHT)
        # Same downscale-only fit as the forward mapping.
        scale = min(self.window_width / max(1, disp_w), available_height / max(1, disp_h))
        if scale < 1.0:
            final_w = int(disp_w * scale)
            final_h = int(disp_h * scale)
        else:
            final_w = disp_w
            final_h = disp_h
            scale = 1.0
        start_x = (self.window_width - final_w) // 2
        start_y = (available_height - final_h) // 2
        # Undo letterbox offset and fit scale (1e-6 guards divide-by-zero).
        zx = (sx - start_x) / max(1e-6, scale)
        zy = (sy - start_y) / max(1e-6, scale)
        # Undo zoom.
        rx = zx / max(1e-6, self.zoom_factor)
        ry = zy / max(1e-6, self.zoom_factor)
        # Inverse of the rotation cases in _map_original_to_screen
        # (each branch below algebraically inverts the matching forward branch).
        if angle == 90:
            px, py = rotated_w - ry, rx
        elif angle == 180:
            px, py = rotated_w - rx, rotated_h - ry
        elif angle == 270:
            px, py = ry, rotated_h - rx
        else:
            px, py = rx, ry
        # Back to original-frame coordinates via the crop offset; clamp inside frame.
        ox = px + cx
        oy = py + cy
        ox = max(0, min(int(round(ox)), self.frame_width - 1))
        oy = max(0, min(int(round(oy)), self.frame_height - 1))
        return ox, oy
def clear_transformation_cache(self):
"""Clear the cached transformation to force recalculation"""
self.cached_transformed_frame = None
@@ -1665,10 +1812,13 @@ class VideoEditor:
seek_multiplier_text = (
f" | Seek: {self.seek_multiplier:.1f}x" if self.seek_multiplier != 1.0 else ""
)
motion_text = (
f" | Motion: {self.tracking_enabled}" if self.tracking_enabled else ""
)
if self.is_image_mode:
info_text = f"Image | Zoom: {self.zoom_factor:.1f}x{rotation_text}{brightness_text}{contrast_text}"
info_text = f"Image | Zoom: {self.zoom_factor:.1f}x{rotation_text}{brightness_text}{contrast_text}{motion_text}"
else:
info_text = f"Frame: {self.current_frame}/{self.total_frames} | Speed: {self.playback_speed:.1f}x | Zoom: {self.zoom_factor:.1f}x{seek_multiplier_text}{rotation_text}{brightness_text}{contrast_text} | {'Playing' if self.is_playing else 'Paused'}"
info_text = f"Frame: {self.current_frame}/{self.total_frames} | Speed: {self.playback_speed:.1f}x | Zoom: {self.zoom_factor:.1f}x{seek_multiplier_text}{rotation_text}{brightness_text}{contrast_text}{motion_text} | {'Playing' if self.is_playing else 'Paused'}"
cv2.putText(
canvas,
info_text,
@@ -1754,6 +1904,19 @@ class VideoEditor:
1,
)
# Draw tracking overlays (points and interpolated cross)
pts = self.tracking_points.get(self.current_frame, []) if not self.is_image_mode else []
for (ox, oy) in pts:
sx, sy = self._map_original_to_screen(ox, oy)
cv2.circle(canvas, (sx, sy), 6, (0, 255, 0), -1)
cv2.circle(canvas, (sx, sy), 6, (255, 255, 255), 1)
if self.tracking_enabled and not self.is_image_mode:
interp = self._get_interpolated_tracking_position(self.current_frame)
if interp:
sx, sy = self._map_original_to_screen(interp[0], interp[1])
cv2.line(canvas, (sx - 10, sy), (sx + 10, sy), (255, 0, 0), 2)
cv2.line(canvas, (sx, sy - 10), (sx, sy + 10), (255, 0, 0), 2)
# Draw timeline
self.draw_timeline(canvas)
@@ -1812,6 +1975,31 @@ class VideoEditor:
if flags & cv2.EVENT_FLAG_CTRLKEY and event == cv2.EVENT_LBUTTONDOWN:
self.zoom_center = (x, y)
# Handle right-click for tracking points (no modifiers)
if event == cv2.EVENT_RBUTTONDOWN and not (flags & (cv2.EVENT_FLAG_CTRLKEY | cv2.EVENT_FLAG_SHIFTKEY)):
if not self.is_image_mode:
ox, oy = self._map_screen_to_original(x, y)
threshold = 50
removed = False
if self.current_frame in self.tracking_points:
pts_screen = []
for idx, (px, py) in enumerate(self.tracking_points[self.current_frame]):
sxp, syp = self._map_original_to_screen(px, py)
pts_screen.append((idx, sxp, syp))
for idx, sxp, syp in pts_screen:
if (sxp - x) ** 2 + (syp - y) ** 2 <= threshold ** 2:
del self.tracking_points[self.current_frame][idx]
if not self.tracking_points[self.current_frame]:
del self.tracking_points[self.current_frame]
self.show_feedback_message("Tracking point removed")
removed = True
break
if not removed:
self.tracking_points.setdefault(self.current_frame, []).append((int(ox), int(oy)))
self.show_feedback_message("Tracking point added")
self.clear_transformation_cache()
self.save_state()
# Handle scroll wheel for zoom (Ctrl + scroll)
if flags & cv2.EVENT_FLAG_CTRLKEY:
if event == cv2.EVENT_MOUSEWHEEL:
@@ -1832,119 +2020,34 @@ class VideoEditor:
if self.current_display_frame is None:
return
# Get the original frame dimensions
original_height, original_width = self.current_display_frame.shape[:2]
available_height = self.window_height - (0 if self.is_image_mode else self.TIMELINE_HEIGHT)
# Calculate how the original frame is displayed (after crop/zoom/rotation)
display_frame = self.apply_crop_zoom_and_rotation(
self.current_display_frame.copy()
)
if display_frame is None:
return
display_height, display_width = display_frame.shape[:2]
# Calculate scale for the display frame
scale = min(
self.window_width / display_width, available_height / display_height
)
if scale < 1.0:
final_display_width = int(display_width * scale)
final_display_height = int(display_height * scale)
else:
final_display_width = display_width
final_display_height = display_height
scale = 1.0
start_x = (self.window_width - final_display_width) // 2
start_y = (available_height - final_display_height) // 2
# Convert screen coordinates to display frame coordinates
display_x = (x - start_x) / scale
display_y = (y - start_y) / scale
display_w = w / scale
display_h = h / scale
# Clamp to display frame bounds
display_x = max(0, min(display_x, display_width))
display_y = max(0, min(display_y, display_height))
display_w = min(display_w, display_width - display_x)
display_h = min(display_h, display_height - display_y)
# Now we need to convert from the display frame coordinates back to original frame coordinates
# The display frame is the result of: original -> crop -> rotation -> zoom
# Step 1: Reverse zoom
if self.zoom_factor != 1.0:
display_x = display_x / self.zoom_factor
display_y = display_y / self.zoom_factor
display_w = display_w / self.zoom_factor
display_h = display_h / self.zoom_factor
# Step 2: Reverse rotation
if self.rotation_angle != 0:
# Get the dimensions of the frame after crop but before rotation
if self.crop_rect:
crop_w, crop_h = int(self.crop_rect[2]), int(self.crop_rect[3])
else:
crop_w, crop_h = original_width, original_height
# Apply inverse rotation to coordinates
# The key insight: we need to use the dimensions of the ROTATED frame for the coordinate transformation
# because the coordinates we have are in the rotated coordinate system
if self.rotation_angle == 90:
# 90° clockwise rotation: (x,y) -> (y, rotated_width-x-w)
# The rotated frame has dimensions: height x width (swapped)
rotated_w, rotated_h = crop_h, crop_w
new_x = display_y
new_y = rotated_w - display_x - display_w
new_w = display_h
new_h = display_w
elif self.rotation_angle == 180:
# 180° rotation: (x,y) -> (width-x-w, height-y-h)
new_x = crop_w - display_x - display_w
new_y = crop_h - display_y - display_h
new_w = display_w
new_h = display_h
elif self.rotation_angle == 270:
# 270° clockwise rotation: (x,y) -> (rotated_height-y-h, x)
# The rotated frame has dimensions: height x width (swapped)
rotated_w, rotated_h = crop_h, crop_w
new_x = rotated_h - display_y - display_h
new_y = display_x
new_w = display_h
new_h = display_w
else:
new_x, new_y, new_w, new_h = display_x, display_y, display_w, display_h
display_x, display_y, display_w, display_h = new_x, new_y, new_w, new_h
# Step 3: Convert from cropped frame coordinates to original frame coordinates
original_x = display_x
original_y = display_y
original_w = display_w
original_h = display_h
# Add the crop offset to get back to original frame coordinates
if self.crop_rect:
crop_x, crop_y, crop_w, crop_h = self.crop_rect
original_x += crop_x
original_y += crop_y
# Map both corners from screen to original to form an axis-aligned crop
# All coordinates are in reference to the ORIGINAL frame
# User input arrives in processed display space → map back to original
x2 = x + w
y2 = y + h
ox1, oy1 = self._map_screen_to_original(x, y)
ox2, oy2 = self._map_screen_to_original(x2, y2)
left = min(ox1, ox2)
top = min(oy1, oy2)
right = max(ox1, ox2)
bottom = max(oy1, oy2)
original_x = left
original_y = top
original_w = max(10, right - left)
original_h = max(10, bottom - top)
# Clamp to original frame bounds
original_x = max(0, min(original_x, original_width))
original_y = max(0, min(original_y, original_height))
original_w = min(original_w, original_width - original_x)
original_h = min(original_h, original_height - original_y)
original_x = max(0, min(original_x, self.frame_width - 1))
original_y = max(0, min(original_y, self.frame_height - 1))
original_w = min(original_w, self.frame_width - original_x)
original_h = min(original_h, self.frame_height - original_y)
if original_w > 10 and original_h > 10: # Minimum size check
# Save current crop for undo
if original_w > 10 and original_h > 10:
if self.crop_rect:
self.crop_history.append(self.crop_rect)
self.crop_rect = (original_x, original_y, original_w, original_h)
self.clear_transformation_cache()
self.save_state() # Save state when crop is set
self.save_state()
def seek_to_timeline_position(self, mouse_x, bar_x_start, bar_width):
"""Seek to position based on mouse click on timeline"""
@@ -2291,12 +2394,15 @@ class VideoEditor:
return False
def _process_frame_for_render(self, frame, output_width: int, output_height: int):
def _process_frame_for_render(self, frame, output_width: int, output_height: int, frame_number: int = None):
"""Process a single frame for rendering (optimized for speed)"""
try:
# Apply crop (vectorized operation)
if self.crop_rect:
x, y, w, h = map(int, self.crop_rect)
if frame_number is None:
x, y, w, h = map(int, self.crop_rect)
else:
x, y, w, h = map(int, self._get_effective_crop_rect_for_frame(frame_number))
# Clamp coordinates to frame bounds
h_frame, w_frame = frame.shape[:2]
@@ -2409,7 +2515,7 @@ class VideoEditor:
if not ret:
break
processed_frame = self._process_frame_for_render(frame, output_width, output_height)
processed_frame = self._process_frame_for_render(frame, output_width, output_height, start_frame + i)
if processed_frame is not None:
if i == 0:
print(f"Processed frame dimensions: {processed_frame.shape[1]}x{processed_frame.shape[0]}")
@@ -2500,6 +2606,11 @@ class VideoEditor:
print(" U: Undo crop")
print(" C: Clear crop")
print()
print("Motion Tracking:")
print(" Right-click: Add/remove tracking point (at current frame)")
print(" v: Toggle motion tracking on/off")
print(" V: Clear all tracking points")
print()
print("Other Controls:")
print(" Ctrl+Scroll: Zoom in/out")
print(" Shift+S: Save screenshot")
@@ -2772,6 +2883,16 @@ class VideoEditor:
else:
print(f"DEBUG: File '{self.video_path.stem}' does not contain '_edited_'")
print("Enter key only overwrites files with '_edited_' in the name. Use 'n' to create new files.")
elif key == ord("v"):
# Toggle motion tracking on/off
self.tracking_enabled = not self.tracking_enabled
self.show_feedback_message(f"Motion tracking {'ON' if self.tracking_enabled else 'OFF'}")
self.save_state()
elif key == ord("V"):
# Clear all tracking points
self.tracking_points = {}
self.show_feedback_message("Tracking points cleared")
self.save_state()
elif key == ord("t"):
# Marker looping only for videos
if not self.is_image_mode: