Explorar o código

fix:数据集变更bug修复

tanjingyu hai 1 mes
pai
achega
681e4020ab
Modificáronse 2 ficheiros con 52 adicións e 11 borrados
  1. 46 6
      app/services/storage_service.py
  2. 6 5
      app/services/webhook_service.py

+ 46 - 6
app/services/storage_service.py

@@ -44,15 +44,55 @@ class StorageService:
             return None
 
     def rollback_version(self, version: DataVersion):
-        """Remove a version and all its associated file records.
-
-        Used when a push didn't produce any data file changes,
-        meaning it was a code-only commit with no snapshot value.
-        """
+        """Remove a version and all its associated file records."""
         self.db.query(DataFile).filter(DataFile.version_id == version.id).delete()
         self.db.delete(version)
         self.db.commit()
-        logger.info(f"Rolled back empty version {version.id}")
+        logger.info(f"Rolled back unchanged version {version.id}")
+
+    def is_snapshot_changed(self, version: DataVersion, has_new_uploads: bool) -> bool:
+        """
+        Determine if this version represents a meaningful change compared to the previous one.
+        A version is meaningful if:
+        1. Any file content changed (has_new_uploads is True)
+        2. The set of files (paths) is different from the previous version record for this stage.
+        """
+        # 1. If content changed, it's definitely a new version
+        if has_new_uploads:
+            return True
+
+        # 2. Get current file paths
+        current_files = self.db.query(DataFile).filter(DataFile.version_id == version.id).all()
+        if not current_files:
+            return False  # No data files at all, don't keep
+
+        # 3. Find the most recent previous version for this project and stage
+        prev_version = (
+            self.db.query(DataVersion)
+            .filter(
+                DataVersion.project_id == version.project_id,
+                DataVersion.stage == version.stage,
+                DataVersion.id != version.id
+            )
+            .order_by(DataVersion.created_at.desc())
+            .first()
+        )
+
+        # 4. If there's no previous version, we keep this one as the baseline
+        if not prev_version:
+            return True
+
+        # 5. Compare the set of relative paths
+        prev_files = self.db.query(DataFile).filter(DataFile.version_id == prev_version.id).all()
+        
+        prev_paths = {f.relative_path for f in prev_files}
+        curr_paths = {f.relative_path for f in current_files}
+
+        if prev_paths != curr_paths:
+            logger.info(f"Snapshot file set changed for stage '{version.stage}': {len(prev_paths)} -> {len(curr_paths)} files")
+            return True
+
+        return False
 
     async def process_file_with_sha(
         self,

+ 6 - 5
app/services/webhook_service.py

@@ -114,16 +114,17 @@ class WebhookService:
             logger.info(f"Processing stage '{stage_name}' with {len(outputs)} output rules")
 
             # Process outputs and check if any file actually changed
-            has_changes = await self._process_outputs(
+            has_new_uploads = await self._process_outputs(
                 version, outputs, owner, repo_name, after_sha
             )
 
-            if not has_changes:
-                # No data files changed — this was a code-only push, discard the snapshot
+            # Check if this version represents a real change (content OR file set)
+            if not self.storage.is_snapshot_changed(version, has_new_uploads):
+                # No changes detected — this was a code-only push, discard the snapshot
                 self.storage.rollback_version(version)
                 logger.info(
-                    f"Stage '{stage_name}': no data file changes detected. "
-                    f"Version discarded (code-only push)."
+                    f"Stage '{stage_name}': no data changes detected (content and file set same). "
+                    f"Version discarded."
                 )
 
     async def _process_outputs(