Просмотр исходного кода

fix:去掉过期的directory_depth、groupkey逻辑

tanjingyu 7 часов назад
Родитель
Коммит
8c8341c769

+ 0 - 5
app/main.py

@@ -97,11 +97,6 @@ def read_root():
     return FileResponse(os.path.join(STATIC_DIR, "records.html"), media_type="text/html")
 
 
-@app.get("/fs")
-def filesystem_page():
-    """Serve the legacy file system UI."""
-    return FileResponse(os.path.join(STATIC_DIR, "index.html"), media_type="text/html")
-
 
 @app.get("/records")
 def records_page():

+ 142 - 102
app/services/gogs_client.py

@@ -1,3 +1,4 @@
+import asyncio
 import httpx
 from app.config import settings
 import logging
@@ -8,12 +9,44 @@ logger = logging.getLogger(__name__)
 # Default timeout for API requests (seconds)
 _DEFAULT_TIMEOUT = 30.0
 
+# Maximum concurrent requests to Gogs to avoid 401 rate-limiting
+_MAX_CONCURRENT = 3
+
 
 class GogsClient:
     def __init__(self):
         self.base_url = settings.GOGS_URL.rstrip('/')
         self.token = settings.GOGS_TOKEN
         self.headers = {"Authorization": f"token {self.token}"}
+        self._semaphore = asyncio.Semaphore(_MAX_CONCURRENT)
+        self._client: httpx.AsyncClient | None = None
+
+    # ---- shared client & throttled helpers --------------------------------
+
+    def _get_client(self) -> httpx.AsyncClient:
+        """Lazily create a shared httpx client with connection pooling."""
+        if self._client is None or self._client.is_closed:
+            self._client = httpx.AsyncClient(
+                timeout=_DEFAULT_TIMEOUT,
+                headers=self.headers,
+            )
+        return self._client
+
+    async def _get(self, url: str) -> httpx.Response:
+        """Throttled GET via shared client."""
+        async with self._semaphore:
+            return await self._get_client().get(url)
+
+    async def _post(self, url: str, **kwargs) -> httpx.Response:
+        """Throttled POST via shared client."""
+        async with self._semaphore:
+            return await self._get_client().post(url, **kwargs)
+
+    async def aclose(self):
+        """Close the underlying HTTP client (connection pool)."""
+        if self._client and not self._client.is_closed:
+            await self._client.aclose()
+            self._client = None
 
     # ------------------------------------------------------------------
     # Repository discovery
@@ -27,12 +60,11 @@ class GogsClient:
         """
         repos: list[dict] = []
 
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            url = f"{self.base_url}/api/v1/user/repos"
-            resp = await client.get(url, headers=self.headers)
-            resp.raise_for_status()
-            batch = resp.json()
-            repos.extend(batch)
+        url = f"{self.base_url}/api/v1/user/repos"
+        resp = await self._get(url)
+        resp.raise_for_status()
+        batch = resp.json()
+        repos.extend(batch)
 
         logger.info(f"Fetched {len(repos)} repositories in total")
         return repos
@@ -44,10 +76,9 @@ class GogsClient:
     async def list_repo_webhooks(self, owner: str, repo: str) -> list[dict]:
         """List all webhooks configured on a repository."""
         url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/hooks"
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            resp = await client.get(url, headers=self.headers)
-            resp.raise_for_status()
-            return resp.json()
+        resp = await self._get(url)
+        resp.raise_for_status()
+        return resp.json()
 
     async def create_repo_webhook(
         self,
@@ -72,10 +103,9 @@ class GogsClient:
             "events": events or ["push"],
             "active": True,
         }
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            resp = await client.post(url, headers=self.headers, json=payload)
-            resp.raise_for_status()
-            return resp.json()
+        resp = await self._post(url, json=payload)
+        resp.raise_for_status()
+        return resp.json()
 
     # ------------------------------------------------------------------
     # Manifest / file operations (existing)
@@ -84,12 +114,11 @@ class GogsClient:
     async def get_manifest(self, owner: str, repo: str, ref: str) -> str | None:
         """Fetch manifest.yaml raw content from a given ref (commit / branch)."""
         url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/raw/{ref}/manifest.yaml"
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            resp = await client.get(url, headers=self.headers)
-            if resp.status_code == 404:
-                return None
-            resp.raise_for_status()
-            return resp.text
+        resp = await self._get(url)
+        if resp.status_code == 404:
+            return None
+        resp.raise_for_status()
+        return resp.text
 
     @staticmethod
     def _candidate_refs(ref: str, fallback_ref: str | None = None) -> list[str]:
@@ -103,10 +132,9 @@ class GogsClient:
     async def get_tree(self, owner: str, repo: str, path: str = "", *, ref: str) -> list:
         """Get the file tree of a repository."""
         url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{path}?ref={ref}"
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            resp = await client.get(url, headers=self.headers)
-            resp.raise_for_status()
-            return resp.json()
+        resp = await self._get(url)
+        resp.raise_for_status()
+        return resp.json()
 
     async def get_file_info(
         self,
@@ -122,71 +150,68 @@ class GogsClient:
         Returns dict with 'sha', 'size', 'path' or None if not found.
         """
         refs = self._candidate_refs(ref, fallback_ref)
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            for i, candidate_ref in enumerate(refs):
-                url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{file_path}?ref={candidate_ref}"
-                try:
-                    resp = await client.get(url, headers=self.headers)
-                    if resp.status_code == 404:
-                        continue
-                    resp.raise_for_status()
-                    data = resp.json()
-                    # contents API returns file info directly for single file
-                    if isinstance(data, dict) and data.get("type") == "file":
-                        if i > 0:
-                            logger.info(
-                                f"File info fallback hit for {file_path}: "
-                                f"primary ref '{ref}' -> '{candidate_ref}'"
-                            )
-                        return {
-                            "path": file_path,
-                            "sha": data.get("sha"),
-                            "size": data.get("size", 0),
-                            "type": "blob",
-                            "ref": candidate_ref,
-                        }
-                except httpx.HTTPStatusError as e:
-                    logger.error(
-                        f"Failed to get file info for {file_path} at ref '{candidate_ref}': {e}"
-                    )
+        for i, candidate_ref in enumerate(refs):
+            url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{file_path}?ref={candidate_ref}"
+            try:
+                resp = await self._get(url)
+                if resp.status_code == 404:
+                    continue
+                resp.raise_for_status()
+                data = resp.json()
+                # contents API returns file info directly for single file
+                if isinstance(data, dict) and data.get("type") == "file":
+                    if i > 0:
+                        logger.info(
+                            f"File info fallback hit for {file_path}: "
+                            f"primary ref '{ref}' -> '{candidate_ref}'"
+                        )
+                    return {
+                        "path": file_path,
+                        "sha": data.get("sha"),
+                        "size": data.get("size", 0),
+                        "type": "blob",
+                        "ref": candidate_ref,
+                    }
+            except httpx.HTTPStatusError as e:
+                logger.error(
+                    f"Failed to get file info for {file_path} at ref '{candidate_ref}': {e}"
+                )
         return None
 
     async def get_directory_tree(self, owner: str, repo: str, dir_path: str, *, ref: str) -> list:
         """Get all files under a specific directory (recursive) using concurrency."""
-        import asyncio
         all_files = []
 
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT, headers=self.headers) as client:
-            async def fetch_contents(path: str):
-                """Recursively fetch directory contents using contents API in parallel."""
-                url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{path}?ref={ref}"
-                try:
-                    resp = await client.get(url)
-                    if resp.status_code == 404:
-                        logger.warning(f"Directory not found: {path}")
-                        return
-                    resp.raise_for_status()
-                    data = resp.json()
-
-                    if isinstance(data, list):
-                        tasks = []
-                        for item in data:
-                            if item.get("type") == "file":
-                                all_files.append({
-                                    "path": item.get("path"),
-                                    "sha": item.get("sha"),
-                                    "size": item.get("size", 0),
-                                    "type": "blob"
-                                })
-                            elif item.get("type") == "dir":
-                                tasks.append(fetch_contents(item.get("path")))
-                        
-                        if tasks:
-                            await asyncio.gather(*tasks)
-                except Exception as e:
-                    logger.error(f"Failed to get contents for {path}: {e}")
-
-            await fetch_contents(dir_path)
+        async def fetch_contents(path: str):
+            """Recursively fetch directory contents using contents API in parallel."""
+            url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{path}?ref={ref}"
+            try:
+                resp = await self._get(url)
+                if resp.status_code == 404:
+                    logger.warning(f"Directory not found: {path}")
+                    return
+                resp.raise_for_status()
+                data = resp.json()
+
+                if isinstance(data, list):
+                    tasks = []
+                    for item in data:
+                        if item.get("type") == "file":
+                            all_files.append({
+                                "path": item.get("path"),
+                                "sha": item.get("sha"),
+                                "size": item.get("size", 0),
+                                "type": "blob"
+                            })
+                        elif item.get("type") == "dir":
+                            tasks.append(fetch_contents(item.get("path")))
+
+                    if tasks:
+                        await asyncio.gather(*tasks)
+            except Exception as e:
+                logger.error(f"Failed to get contents for {path}: {e}")
+
+        await fetch_contents(dir_path)
         return all_files
 
     async def get_file_content(
@@ -201,22 +226,37 @@ class GogsClient:
         """Download raw file content."""
         # Gogs raw file URL format: /{owner}/{repo}/raw/{ref}/{path}
         refs = self._candidate_refs(ref, fallback_ref)
-        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-            last_resp = None
-            for i, candidate_ref in enumerate(refs):
-                url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/raw/{candidate_ref}/{file_path}"
-                resp = await client.get(url, headers=self.headers)
-                last_resp = resp
-                if resp.status_code == 404:
-                    continue
-                resp.raise_for_status()
-                if i > 0:
-                    logger.info(
-                        f"File content fallback hit for {file_path}: "
-                        f"primary ref '{ref}' -> '{candidate_ref}'"
-                    )
-                return resp.content
-
-            if last_resp is not None:
-                last_resp.raise_for_status()
-            raise httpx.HTTPError(f"Failed to download content for {file_path}")
+        last_resp = None
+        for i, candidate_ref in enumerate(refs):
+            url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/raw/{candidate_ref}/{file_path}"
+            resp = await self._get(url)
+            last_resp = resp
+            if resp.status_code == 404:
+                continue
+            resp.raise_for_status()
+            if i > 0:
+                logger.info(
+                    f"File content fallback hit for {file_path}: "
+                    f"primary ref '{ref}' -> '{candidate_ref}'"
+                )
+            return resp.content
+
+        if last_resp is not None:
+            last_resp.raise_for_status()
+        raise httpx.HTTPError(f"Failed to download content for {file_path}")
+
+
+async def main():
+    gog = GogsClient()
+    r = await gog.get_file_info(
+        owner="nieqi",
+        repo="aigc_how_decode_base_project_0916_1125",
+    file_path="examples/糯米和Kilala/待解构帖子.json",
+    ref="what_decode_0104_tanjingyu",)
+    print(r)
+    await gog.aclose()
+
+
+import asyncio
+if __name__ == '__main__':
+    asyncio.run(main())

+ 49 - 73
app/services/storage_service.py

@@ -62,21 +62,24 @@ class StorageService:
         return has_new_uploads
 
     def aggregate_version_records(self, version: DataVersion):
-        """Aggregate files in a version into DataRecord groups based on parent directory."""
+        """Aggregate files in a version into DataRecord groups.
+
+        group_key is always the output file's relative_path, ensuring
+        a strict 1 Output : N Inputs mapping per record.
+        """
         from collections import defaultdict
-        
+
         # 1. Clean existing records for this version (idempotency)
         self.db.query(DataRecord).filter(DataRecord.version_id == version.id).delete()
-        
+
         files = self.db.query(DataFile).filter(DataFile.version_id == version.id).all()
-        
-        # 2. Group by dirname
+
+        # 2. Group by explicit group_key (= output file relative_path)
         groups = defaultdict(lambda: {"inputs": [], "outputs": []})
-        
+
         for f in files:
-            # Group key falls back to immediate parent directory if not explicitly saved in f.group_key
-            group_key = f.group_key if f.group_key is not None else os.path.dirname(f.relative_path)
-            
+            group_key = f.group_key if f.group_key else f.relative_path
+
             file_data = {
                 "id": f.id,
                 "relative_path": f.relative_path,
@@ -91,17 +94,15 @@ class StorageService:
             if f.direction == "input":
                 groups[group_key]["inputs"].append(file_data)
             else:
-                # Treat 'output' or None as output by default for rendering purposes
                 groups[group_key]["outputs"].append(file_data)
-        # 3. Insert aggregated records (One record per output file, with differential logic)
+
+        # 3. Insert aggregated records
         for group_key, data in groups.items():
             inputs = data["inputs"]
             outputs = data["outputs"]
-            
-            # 预先获取该 group_key 下所有输出路径的最新状态
-            # 用于判定当前这次 Commit 是否真的产生了变化
-            latest_hashes = {}
-            past_records = (
+
+            # Pre-fetch the latest state to perform differential evaluation
+            past_record = (
                 self.db.query(DataRecord)
                 .filter(
                     DataRecord.project_id == version.project_id,
@@ -110,48 +111,38 @@ class StorageService:
                     DataRecord.version_id != version.id
                 )
                 .order_by(DataRecord.created_at.desc())
-                .limit(200) # 覆盖常见的目录文件数
-                .all()
+                .first()
             )
-            for pr in past_records:
-                # 记录每一个输出路径对应的最新 hash
-                p_out = pr.outputs[0]['relative_path'] if pr.outputs else None
-                if p_out not in latest_hashes:
-                    latest_hashes[p_out] = pr.content_hash
-
-            # 如果没有输出,也创建一个逻辑组
-            output_groups = [[o] for o in outputs] if outputs else [[]]
-            
-            for out_list in output_groups:
-                # 1. 计算当前指纹:输入集合 SHA + 对应输出 SHA
-                all_shas = [f["file_sha"] for f in inputs] + [f["file_sha"] for f in out_list]
-                all_shas.sort()
-                combined_string = "|".join(all_shas)
-                content_hash = hashlib.sha256(combined_string.encode('utf-8')).hexdigest()
-
-                # 2. 差异化判定
-                out_path = out_list[0]['relative_path'] if out_list else None
-                if out_path in latest_hashes and latest_hashes[out_path] == content_hash:
-                    logger.info(f"Skipping unchanged record: {group_key} -> {out_path}")
-                    continue
-
-                # 3. 只有变化了才记录
-                record = DataRecord(
-                    project_id=version.project_id,
-                    version_id=version.id,
-                    stage=version.stage,
-                    commit_id=version.commit_id,
-                    commit_message=version.commit_message,
-                    group_key=group_key,
-                    inputs=inputs,
-                    outputs=out_list,
-                    content_hash=content_hash,
-                    author=version.author,
-                )
-                self.db.add(record)
-            
+
+            latest_hash = past_record.content_hash if past_record else None
+
+            # Calculate current fingerprint: combined SHA of Inputs + Output
+            all_shas = [f["file_sha"] for f in inputs] + [f["file_sha"] for f in outputs]
+            all_shas.sort()
+            combined_string = "|".join(all_shas)
+            content_hash = hashlib.sha256(combined_string.encode('utf-8')).hexdigest()
+
+            # Differential check — skip if unchanged
+            if latest_hash == content_hash:
+                logger.info(f"Skipping unchanged record: {group_key}")
+                continue
+
+            record = DataRecord(
+                project_id=version.project_id,
+                version_id=version.id,
+                stage=version.stage,
+                commit_id=version.commit_id,
+                commit_message=version.commit_message,
+                group_key=group_key,
+                inputs=inputs,
+                outputs=outputs,
+                content_hash=content_hash,
+                author=version.author,
+            )
+            self.db.add(record)
+
         self.db.commit()
-        logger.info(f"Aggregated version {version.id} with refined differential logic.")
+        logger.info(f"Aggregated version {version.id} into records.")
 
     async def process_file_with_sha(
         self,
@@ -163,7 +154,6 @@ class StorageService:
         direction: str = None,
         label: str = None,
         extract_json_key: str = None,
-        directory_depth: int = None,
         group_key: str = None,
         content_ref: str | None = None,
     ) -> bool:
@@ -191,20 +181,7 @@ class StorageService:
         should_extract = bool(extract_json_key and relative_path.lower().endswith(".json"))
         extracted_val = None
 
-        # Calculate group_key: explicit override > directory_depth > dirname fallback
-        if group_key is not None:
-            calc_group_key = group_key
-        elif directory_depth is not None and directory_depth > 0:
-            parts = relative_path.split("/")
-            # Remove filename
-            if len(parts) > 1:
-                parts = parts[:-1]
-                # Combine up to directory_depth
-                calc_group_key = "/".join(parts[:directory_depth])
-            else:
-                calc_group_key = "" # File is in root directory
-        else:
-            calc_group_key = os.path.dirname(relative_path)  # Default fallback
+        calc_group_key = group_key if group_key else relative_path
 
         download_ref = content_ref or version.commit_id
 
@@ -234,13 +211,12 @@ class StorageService:
 
         if last_file and last_file.file_sha == file_sha:
             # ── Unchanged: reuse previous OSS key, still record a snapshot entry ──
-            # Optimization: Try to reuse previously extracted value if the SHA hasn't changed
             if should_extract:
                 if last_file.extracted_value is not None:
                     extracted_val = last_file.extracted_value
                 else:
                     extracted_val = await _extract_val()
-            
+
             new_file = DataFile(
                 version_id=version.id,
                 relative_path=relative_path,

+ 157 - 24
app/services/webhook_service.py

@@ -5,7 +5,7 @@ import os
 import asyncio
 import re
 from sqlalchemy.orm import Session
-from app.models import Project, DataVersion
+from app.models import Project, DataVersion, DataFile, DataRecord
 from app.services.gogs_client import GogsClient
 from app.services.storage_service import StorageService
 
@@ -82,13 +82,6 @@ class WebhookService:
         # 6. Process stages
         stages = manifest.get("stages", [])
 
-        # Backward compatibility: old single-stage format
-        if not stages and manifest.get("stage"):
-            stages = [{
-                "name": manifest.get("stage"),
-                "outputs": manifest.get("outputs", [])
-            }]
-
         if not stages:
             logger.error("Manifest missing stages configuration")
             return
@@ -157,6 +150,31 @@ class WebhookService:
             else:
                 self.storage.aggregate_version_records(version)
 
+        # ── Backfill: supplement missing paired inputs in recent versions ──
+        # Handles the case where output files were committed BEFORE their
+        # paired input files.  When the input file arrives in a later push,
+        # we retroactively attach it to the older records that were missing it.
+        for stage_config in stages:
+            stage_name = stage_config.get("name")
+            outputs = stage_config.get("outputs", [])
+            if not stage_name or not outputs:
+                continue
+            has_paired = any(
+                oc.get("paired_input") or oc.get("paired_inputs")
+                for oc in outputs
+            )
+            if not has_paired:
+                continue
+            if not manifest_changed and not self._is_stage_affected(outputs, all_changed_files):
+                continue
+            await self._backfill_incomplete_records(
+                project.id, stage_name, outputs,
+                owner, repo_name, after_sha,
+            )
+
+        # Close the shared HTTP client (connection pool)
+        await self.gogs.aclose()
+
     def _get_all_changed_files(self, payload: dict) -> set[str]:
         """Extract all added, modified, and removed files from all commits in payload."""
         files = set()
@@ -245,21 +263,10 @@ class WebhookService:
         if "paired_input" in output_config:
             paired_configs.append(output_config["paired_input"])
 
-        # Calculate group_key here so both paired input and output can share it
-        directory_depth = output_config.get("directory_depth")
-        if directory_depth is not None and directory_depth > 0:
-            parts = file_path.split("/")
-            if len(parts) > 1:
-                group_key = "/".join(parts[:-1][:directory_depth])
-            else:
-                group_key = ""
-        elif paired_configs:
-            # If we have paired inputs, use the full file path as a unique group key 
-            # to avoid "cross-talk" where multiple outputs in the same directory 
-            # share all paired inputs from that directory.
-            group_key = file_path
-        else:
-            group_key = os.path.dirname(file_path)
+        # The group_key is ALWAYS the output file's exact relative_path.
+        # This guarantees 1 Output : N Inputs mapping strictly.
+        group_key = file_path
+
 
         # Deduplicate API calls and DB entries across concurrently running tasks
         task_key = (file_path, group_key)
@@ -277,7 +284,6 @@ class WebhookService:
             direction=output_config.get("direction"),
             label=output_config.get("label"),
             extract_json_key=output_config.get("extract_json_key"),
-            directory_depth=directory_depth,
             group_key=group_key,
             content_ref=file_info.get("ref", ref),
         )
@@ -325,6 +331,133 @@ class WebhookService:
 
         return has_change
 
+    async def _backfill_incomplete_records(
+        self, project_id: str, stage_name: str, outputs: list,
+        owner: str, repo_name: str, current_commit: str,
+    ):
+        """Backfill paired inputs that were missing when records were first created.
+
+        When output files are committed before their paired input files, the
+        initial records will have empty inputs.  This method finds those
+        incomplete records and tries to fetch the now-available paired inputs
+        using the *current* commit (which may contain files added later).
+        """
+        recent_versions = (
+            self.db.query(DataVersion)
+            .filter(
+                DataVersion.project_id == project_id,
+                DataVersion.stage == stage_name,
+            )
+            .order_by(DataVersion.created_at.desc())
+            .limit(20)
+            .all()
+        )
+
+        # Cache Gogs lookups so we don't fetch the same path twice
+        file_info_cache: dict[str, dict | None] = {}
+
+        for version in recent_versions:
+            records = (
+                self.db.query(DataRecord)
+                .filter(DataRecord.version_id == version.id)
+                .all()
+            )
+
+            needs_reaggregate = False
+
+            for record in records:
+                out_path = (
+                    record.outputs[0]["relative_path"]
+                    if record.outputs
+                    else None
+                )
+                if not out_path:
+                    continue
+
+                output_config = self._find_matching_output(out_path, outputs)
+                if not output_config:
+                    continue
+
+                paired_configs = list(output_config.get("paired_inputs", []))
+                if "paired_input" in output_config:
+                    paired_configs.append(output_config["paired_input"])
+                if not paired_configs:
+                    continue
+
+                existing_input_paths = {
+                    inp["relative_path"] for inp in (record.inputs or [])
+                }
+
+                for pc in paired_configs:
+                    extract_regex = pc.get("extract_regex")
+                    path_template = pc.get("path_template")
+                    if not extract_regex or not path_template:
+                        continue
+
+                    match = re.search(extract_regex, out_path)
+                    if not match:
+                        continue
+
+                    try:
+                        paired_path = path_template.format(**match.groupdict())
+                    except KeyError:
+                        continue
+
+                    # Already present in this record's inputs
+                    if paired_path in existing_input_paths:
+                        continue
+
+                    # DataFile already exists but not yet reflected in record
+                    existing_df = (
+                        self.db.query(DataFile)
+                        .filter(
+                            DataFile.version_id == version.id,
+                            DataFile.relative_path == paired_path,
+                            DataFile.group_key == record.group_key,
+                        )
+                        .first()
+                    )
+                    if existing_df:
+                        needs_reaggregate = True
+                        continue
+
+                    # Fetch from Gogs (with cache)
+                    if paired_path not in file_info_cache:
+                        file_info_cache[paired_path] = (
+                            await self.gogs.get_file_info(
+                                owner, repo_name, paired_path,
+                                ref=current_commit,
+                            )
+                        )
+                    paired_info = file_info_cache[paired_path]
+
+                    if not paired_info:
+                        continue  # still not available
+
+                    await self.storage.process_file_with_sha(
+                        version,
+                        paired_path,
+                        paired_info.get("sha"),
+                        owner,
+                        repo_name,
+                        direction=pc.get("direction", "input"),
+                        label=pc.get("label"),
+                        extract_json_key=pc.get("extract_json_key"),
+                        group_key=record.group_key,
+                        content_ref=paired_info.get("ref", current_commit),
+                    )
+                    needs_reaggregate = True
+                    logger.info(
+                        f"Backfilled paired input {paired_path} "
+                        f"for version {version.id} (commit {version.commit_id[:8]})"
+                    )
+
+            if needs_reaggregate:
+                self.storage.aggregate_version_records(version)
+                logger.info(
+                    f"Re-aggregated version {version.id} after backfilling paired inputs"
+                )
+
     async def _process_outputs(
         self,
         version,

+ 0 - 990
app/static/console.html

@@ -1,990 +0,0 @@
-<!DOCTYPE html>
-<html lang="zh-CN">
-
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Data Nexus - 数据控制台</title>
-    <meta name="description" content="Data Nexus 数据中台管理控制台">
-    <link rel="preconnect" href="https://fonts.googleapis.com">
-    <link
-        href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap"
-        rel="stylesheet">
-    <style>
-        *,
-        *::before,
-        *::after {
-            margin: 0;
-            padding: 0;
-            box-sizing: border-box;
-        }
-
-        :root {
-            --bg-base: #080c18;
-            --bg-sidebar: #0c1222;
-            --bg-surface: #111827;
-            --bg-card: #151f32;
-            --bg-card-head: rgba(0, 0, 0, 0.2);
-            --bg-hover: rgba(255, 255, 255, 0.04);
-            --bg-active: rgba(99, 179, 237, 0.08);
-            --border: rgba(255, 255, 255, 0.06);
-            --border-card: rgba(255, 255, 255, 0.08);
-            --border-active: rgba(99, 179, 237, 0.35);
-            --text-primary: #e2e8f0;
-            --text-secondary: #8b9ab5;
-            --text-muted: #556477;
-            --accent: #63b3ed;
-            --accent-light: #90cdf4;
-            --accent-dim: rgba(99, 179, 237, 0.12);
-            --green: #68d391;
-            --green-dim: rgba(104, 211, 145, 0.12);
-            --orange: #f6ad55;
-            --orange-dim: rgba(246, 173, 85, 0.12);
-            --purple: #b794f4;
-            --radius: 8px;
-            --sidebar-w: 280px;
-        }
-
-        body {
-            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'PingFang SC', 'Microsoft YaHei', sans-serif;
-            background: var(--bg-base);
-            color: var(--text-primary);
-            height: 100vh;
-            overflow: hidden;
-            line-height: 1.5;
-        }
-
-        ::-webkit-scrollbar {
-            width: 5px;
-        }
-
-        ::-webkit-scrollbar-track {
-            background: transparent;
-        }
-
-        ::-webkit-scrollbar-thumb {
-            background: rgba(255, 255, 255, 0.08);
-            border-radius: 3px;
-        }
-
-        ::-webkit-scrollbar-thumb:hover {
-            background: rgba(255, 255, 255, 0.14);
-        }
-
-        /* Layout */
-        .app {
-            display: grid;
-            grid-template-columns: var(--sidebar-w) 1fr;
-            height: 100vh;
-        }
-
-        /* Sidebar */
-        .sidebar {
-            background: var(--bg-sidebar);
-            border-right: 1px solid var(--border);
-            display: flex;
-            flex-direction: column;
-            overflow: hidden;
-        }
-
-        .sidebar-header {
-            display: flex;
-            align-items: center;
-            gap: 10px;
-            padding: 20px 20px 16px;
-            flex-shrink: 0;
-        }
-
-        .sidebar-header svg {
-            width: 26px;
-            height: 26px;
-            color: var(--accent);
-            flex-shrink: 0;
-        }
-
-        .sidebar-header span {
-            font-size: 16px;
-            font-weight: 700;
-            background: linear-gradient(135deg, var(--accent-light), var(--purple));
-            -webkit-background-clip: text;
-            background-clip: text;
-            -webkit-text-fill-color: transparent;
-        }
-
-        .sidebar-section {
-            padding: 0 16px;
-            flex-shrink: 0;
-        }
-
-        .sidebar-label {
-            display: block;
-            font-size: 11px;
-            font-weight: 600;
-            color: var(--text-muted);
-            text-transform: uppercase;
-            letter-spacing: 0.8px;
-            padding: 12px 4px 8px;
-        }
-
-        .sidebar-divider {
-            height: 1px;
-            background: var(--border);
-            margin: 8px 16px;
-            flex-shrink: 0;
-        }
-
-        /* Project select */
-        .select-wrap {
-            position: relative;
-        }
-
-        .select-wrap::after {
-            content: '';
-            position: absolute;
-            right: 12px;
-            top: 50%;
-            transform: translateY(-50%);
-            border: 4px solid transparent;
-            border-top: 5px solid var(--text-muted);
-            pointer-events: none;
-        }
-
-        .project-select {
-            width: 100%;
-            padding: 9px 32px 9px 12px;
-            background: var(--bg-surface);
-            border: 1px solid var(--border);
-            border-radius: 6px;
-            color: var(--text-primary);
-            font-family: inherit;
-            font-size: 13px;
-            cursor: pointer;
-            appearance: none;
-            outline: none;
-            transition: border-color 0.2s;
-        }
-
-        .project-select:focus {
-            border-color: var(--accent);
-        }
-
-        .project-select option {
-            background: var(--bg-surface);
-            color: var(--text-primary);
-        }
-
-        /* Stage tree */
-        .stage-tree-wrap {
-            flex: 1;
-            overflow-y: auto;
-            padding: 0 8px 16px;
-        }
-
-        .tree-branch-header {
-            display: flex;
-            align-items: center;
-            gap: 4px;
-            padding: 7px 10px;
-            cursor: pointer;
-            border-radius: 6px;
-            transition: background 0.12s;
-            user-select: none;
-            font-size: 13px;
-            color: var(--text-secondary);
-        }
-
-        .tree-branch-header:hover {
-            background: var(--bg-hover);
-        }
-
-        .tree-arrow {
-            width: 16px;
-            height: 16px;
-            color: var(--text-muted);
-            transition: transform 0.2s;
-            flex-shrink: 0;
-        }
-
-        .tree-arrow.open {
-            transform: rotate(90deg);
-        }
-
-        .tree-children {
-            display: none;
-            padding-left: 8px;
-            margin-left: 12px;
-            border-left: 1px solid var(--border);
-        }
-
-        .tree-children.open {
-            display: block;
-        }
-
-        .tree-leaf {
-            display: flex;
-            align-items: center;
-            gap: 8px;
-            padding: 7px 10px 7px 12px;
-            cursor: pointer;
-            border-radius: 6px;
-            transition: all 0.12s;
-            font-size: 13px;
-            color: var(--text-secondary);
-        }
-
-        .tree-leaf:hover {
-            background: var(--bg-hover);
-            color: var(--text-primary);
-        }
-
-        .tree-leaf.active {
-            background: var(--bg-active);
-            color: var(--accent);
-            font-weight: 500;
-        }
-
-        .tree-dot {
-            width: 5px;
-            height: 5px;
-            border-radius: 50%;
-            background: var(--text-muted);
-            flex-shrink: 0;
-        }
-
-        .tree-leaf.active .tree-dot {
-            background: var(--accent);
-        }
-
-        .tree-count {
-            margin-left: auto;
-            font-size: 11px;
-            color: var(--text-muted);
-            background: rgba(255, 255, 255, 0.04);
-            padding: 1px 6px;
-            border-radius: 4px;
-        }
-
-        /* Content */
-        .content {
-            display: flex;
-            flex-direction: column;
-            height: 100vh;
-            overflow: hidden;
-        }
-
-        .content-header {
-            flex-shrink: 0;
-            padding: 18px 28px;
-            border-bottom: 1px solid var(--border);
-            background: rgba(12, 18, 34, 0.6);
-            backdrop-filter: blur(12px);
-            display: flex;
-            align-items: center;
-            justify-content: space-between;
-            min-height: 60px;
-        }
-
-        .stage-path {
-            display: flex;
-            align-items: center;
-            gap: 6px;
-            font-size: 14px;
-        }
-
-        .stage-path .sep {
-            color: var(--text-muted);
-            font-size: 11px;
-        }
-
-        .stage-path .seg {
-            color: var(--text-secondary);
-        }
-
-        .stage-path .seg:last-child {
-            color: var(--text-primary);
-            font-weight: 600;
-        }
-
-        .header-info {
-            font-size: 12px;
-            color: var(--text-muted);
-        }
-
-        .content-body {
-            flex: 1;
-            overflow-y: auto;
-            padding: 24px 28px;
-        }
-
-        /* Welcome state */
-        .state-box {
-            display: flex;
-            flex-direction: column;
-            align-items: center;
-            justify-content: center;
-            height: 100%;
-            text-align: center;
-            color: var(--text-muted);
-            padding: 40px;
-        }
-
-        .state-box svg {
-            width: 56px;
-            height: 56px;
-            margin-bottom: 20px;
-            opacity: 0.25;
-        }
-
-        .state-box h2 {
-            font-size: 17px;
-            font-weight: 600;
-            color: var(--text-secondary);
-            margin-bottom: 6px;
-        }
-
-        .state-box p {
-            font-size: 13px;
-        }
-
-        /* Spinner */
-        .spinner {
-            width: 28px;
-            height: 28px;
-            border: 3px solid var(--border);
-            border-top-color: var(--accent);
-            border-radius: 50%;
-            animation: spin 0.7s linear infinite;
-            margin-bottom: 16px;
-        }
-
-        @keyframes spin {
-            to {
-                transform: rotate(360deg);
-            }
-        }
-
-        /* Version card */
-        .version-card {
-            background: var(--bg-card);
-            border: 1px solid var(--border-card);
-            border-radius: var(--radius);
-            overflow: hidden;
-            margin-bottom: 16px;
-            animation: fadeUp 0.35s ease forwards;
-            opacity: 0;
-        }
-
-        .version-card:hover {
-            border-color: rgba(255, 255, 255, 0.1);
-        }
-
-        .version-head {
-            display: flex;
-            align-items: center;
-            gap: 14px;
-            padding: 14px 20px;
-            border-bottom: 1px solid var(--border);
-            background: var(--bg-card-head);
-            flex-wrap: wrap;
-        }
-
-        .v-author {
-            font-size: 13px;
-            color: var(--text-secondary);
-        }
-
-        .v-time {
-            margin-left: auto;
-            font-size: 12px;
-            color: var(--text-muted);
-        }
-
-        /* File row grid */
-        .file-row {
-            display: grid;
-            grid-template-columns: 1fr 100px;
-            gap: 16px;
-            align-items: center;
-            padding: 12px 20px;
-            border-bottom: 1px solid var(--border);
-            transition: background 0.1s;
-        }
-
-        .file-row:last-child {
-            border-bottom: none;
-        }
-
-        .file-row:hover {
-            background: var(--bg-hover);
-        }
-
-        .file-name-col {
-            display: flex;
-            align-items: center;
-            gap: 10px;
-            min-width: 0;
-            /* allows text truncation */
-        }
-
-        @keyframes fadeUp {
-            from {
-                opacity: 0;
-                transform: translateY(10px);
-            }
-
-            to {
-                opacity: 1;
-                transform: translateY(0);
-            }
-        }
-
-        .f-icon {
-            width: 18px;
-            height: 18px;
-            flex-shrink: 0;
-            color: var(--text-muted);
-        }
-
-        .f-name {
-            font-size: 13px;
-            color: var(--text-primary);
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-
-        .f-size {
-            font-size: 11px;
-            color: var(--text-muted);
-            margin-left: 6px;
-        }
-
-        .col-text {
-            font-size: 13px;
-            color: var(--text-secondary);
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-
-        .commit-tag {
-            display: inline-flex;
-            align-items: center;
-            gap: 5px;
-            font-family: 'JetBrains Mono', monospace;
-            font-size: 13px;
-            font-weight: 500;
-            color: var(--accent);
-            background: var(--accent-dim);
-            padding: 3px 10px;
-            border-radius: 5px;
-        }
-
-        .commit-tag svg {
-            width: 14px;
-            height: 14px;
-        }
-
-        .btn-dl-wrap {
-            display: flex;
-            justify-content: flex-end;
-        }
-
-        .btn-dl {
-            display: inline-flex;
-            align-items: center;
-            gap: 4px;
-            padding: 4px 10px;
-            border-radius: 5px;
-            border: none;
-            background: var(--accent-dim);
-            color: var(--accent);
-            font-size: 12px;
-            font-family: inherit;
-            cursor: pointer;
-            transition: all 0.15s;
-            text-decoration: none;
-            flex-shrink: 0;
-        }
-
-        .btn-dl:hover {
-            background: rgba(99, 179, 237, 0.2);
-        }
-
-        .btn-dl svg {
-            width: 13px;
-            height: 13px;
-        }
-
-        /* File group (folder) */
-        .fg-header {
-            display: grid;
-            grid-template-columns: 1fr 100px;
-            gap: 16px;
-            align-items: center;
-            padding: 12px 20px;
-            border-bottom: 1px solid var(--border);
-            cursor: pointer;
-            transition: background 0.1s;
-            user-select: none;
-        }
-
-        .fg-header:hover {
-            background: var(--bg-hover);
-        }
-
-        .fg-name-wrap {
-            display: flex;
-            align-items: center;
-            gap: 8px;
-            min-width: 0;
-        }
-
-        .fg-header:hover {
-            background: var(--bg-hover);
-        }
-
-        .fg-arrow {
-            width: 14px;
-            height: 14px;
-            color: var(--text-muted);
-            transition: transform 0.2s;
-            flex-shrink: 0;
-        }
-
-        .fg-arrow.open {
-            transform: rotate(90deg);
-        }
-
-        .fg-icon {
-            width: 18px;
-            height: 18px;
-            flex-shrink: 0;
-            color: var(--orange);
-        }
-
-        .fg-name {
-            font-size: 13px;
-            color: var(--text-primary);
-            font-weight: 500;
-        }
-
-        .fg-count {
-            font-size: 11px;
-            color: var(--text-muted);
-            background: rgba(255, 255, 255, 0.04);
-            padding: 1px 7px;
-            border-radius: 4px;
-            margin-left: 4px;
-        }
-
-        .fg-children {
-            display: none;
-        }
-
-        .fg-children.open {
-            display: block;
-        }
-
-        /* Load more */
-        .load-more {
-            display: flex;
-            justify-content: center;
-            padding: 16px;
-        }
-
-        .load-more-btn {
-            padding: 9px 28px;
-            border-radius: 6px;
-            border: 1px solid var(--border);
-            background: transparent;
-            color: var(--text-secondary);
-            font-family: inherit;
-            font-size: 13px;
-            cursor: pointer;
-            transition: all 0.2s;
-        }
-
-        .load-more-btn:hover {
-            background: var(--bg-hover);
-            color: var(--text-primary);
-            border-color: var(--border-active);
-        }
-
-        .load-more-btn:disabled {
-            opacity: 0.4;
-            cursor: default;
-        }
-    </style>
-</head>
-
-<body>
-    <div class="app">
-        <aside class="sidebar">
-            <div class="sidebar-header">
-                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                    <path
-                        d="M21 16V8a2 2 0 00-1-1.73l-7-4a2 2 0 00-2 0l-7 4A2 2 0 003 8v8a2 2 0 001 1.73l7 4a2 2 0 002 0l7-4A2 2 0 0021 16z" />
-                    <polyline points="3.27 6.96 12 12.01 20.73 6.96" />
-                    <line x1="12" y1="22.08" x2="12" y2="12" />
-                </svg>
-                <span>Data Nexus</span>
-            </div>
-            <div class="sidebar-divider"></div>
-            <div class="stage-tree-wrap" id="stageTreeWrap"></div>
-        </aside>
-        <main class="content">
-            <div class="content-header">
-                <div class="stage-path" id="stagePath"><span class="seg" style="color:var(--text-muted)">选择左侧数据阶段</span>
-                </div>
-                <div class="header-info" id="headerInfo"></div>
-            </div>
-            <div class="content-body" id="contentBody">
-                <div class="state-box">
-                    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
-                        <path
-                            d="M21 16V8a2 2 0 00-1-1.73l-7-4a2 2 0 00-2 0l-7 4A2 2 0 003 8v8a2 2 0 001 1.73l7 4a2 2 0 002 0l7-4A2 2 0 0021 16z" />
-                        <polyline points="3.27 6.96 12 12.01 20.73 6.96" />
-                        <line x1="12" y1="22.08" x2="12" y2="12" />
-                    </svg>
-                    <h2>欢迎使用数据控制台</h2>
-                    <p>从左侧选择数据阶段,查看文件版本历史</p>
-                </div>
-            </div>
-        </main>
-    </div>
-
-    <script>
-        // ============ Icons ============
-        const IC = {
-            commit: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="4"/><line x1="1.05" y1="12" x2="7" y2="12"/><line x1="17.01" y1="12" x2="22.96" y2="12"/></svg>',
-            file: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/></svg>',
-            folder: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 19a2 2 0 01-2 2H4a2 2 0 01-2-2V5a2 2 0 012-2h5l2 3h9a2 2 0 012 2z"/></svg>',
-            download: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" y1="15" x2="12" y2="3"/></svg>',
-            chevron: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="9 18 15 12 9 6"/></svg>',
-        };
-
-        // ============ State ============
-        const PAGE_SIZE = 20;
-        let S = {
-            stages: [],       // raw stage data from /stages/all
-            stageProjectMap: {}, // stage name -> project_id
-            stage: null,
-            versions: [],
-            skip: 0,
-            hasMore: true,
-            loading: false,
-        };
-
-        const $ = id => document.getElementById(id);
-
-        // ============ Utils ============
-        function esc(s) { if (!s) return ''; const d = document.createElement('div'); d.textContent = s; return d.innerHTML; }
-        function fmtSize(b) {
-            if (!b && b !== 0) return '-';
-            const u = ['B', 'KB', 'MB', 'GB']; let i = 0, s = b;
-            while (s >= 1024 && i < u.length - 1) { s /= 1024; i++; }
-            return s.toFixed(i > 0 ? 1 : 0) + ' ' + u[i];
-        }
-        function fmtTime(iso) {
-            if (!iso) return '';
-            const d = new Date(iso), p = n => String(n).padStart(2, '0');
-            return `${d.getFullYear()}-${p(d.getMonth() + 1)}-${p(d.getDate())} ${p(d.getHours())}:${p(d.getMinutes())}`;
-        }
-        function relTime(iso) {
-            if (!iso) return '';
-            const m = Math.floor((Date.now() - new Date(iso).getTime()) / 60000);
-            if (m < 1) return '刚刚'; if (m < 60) return m + ' 分钟前';
-            const h = Math.floor(m / 60); if (h < 24) return h + ' 小时前';
-            const d = Math.floor(h / 24); if (d < 30) return d + ' 天前';
-            return fmtTime(iso);
-        }
-        async function api(url) { const r = await fetch(url); if (!r.ok) throw new Error(r.status); return r.json(); }
-
-        // ============ Load All Stages ============
-        async function loadAllStages() {
-            $('stageTreeWrap').innerHTML = '<div style="padding:16px;text-align:center;"><div class="spinner" style="margin:0 auto 8px;"></div><span style="font-size:12px;color:var(--text-muted)">加载中...</span></div>';
-            try {
-                S.stages = await api('/stages/all');
-                // Build stage -> project_id mapping
-                S.stageProjectMap = {};
-                S.stages.forEach(st => { S.stageProjectMap[st.name] = st.project_id; });
-                renderStageTree();
-            } catch (e) { $('stageTreeWrap').innerHTML = '<div style="padding:16px;color:#fc8181;font-size:13px;">加载失败</div>'; }
-        }
-
-        // ============ Stage Tree ============
-        function buildTree(stages) {
-            const root = [];
-            for (const st of stages) {
-                const parts = st.name.split('/');
-                let cur = root;
-                for (let i = 0; i < parts.length; i++) {
-                    let node = cur.find(n => n.label === parts[i]);
-                    if (!node) {
-                        node = { label: parts[i], children: [] };
-                        cur.push(node);
-                    }
-                    if (i === parts.length - 1) {
-                        node.stage = st.name;
-                        node.count = st.version_count;
-                    }
-                    cur = node.children;
-                }
-            }
-            return root;
-        }
-
-        function renderStageTree() {
-            const tree = buildTree(S.stages);
-            $('stageTreeWrap').innerHTML = tree.length ? renderNodes(tree) : '<div style="padding:16px;font-size:13px;color:var(--text-muted)">暂无数据阶段</div>';
-        }
-
-        function renderNodes(nodes) {
-            let h = '';
-            for (const n of nodes) {
-                if (n.stage && n.children.length === 0) {
-                    // Leaf
-                    h += `<div class="tree-leaf" data-stage="${esc(n.stage)}" onclick="selectStage(this, '${esc(n.stage)}')">
-                <span class="tree-dot"></span>
-                <span>${esc(n.label)}</span>
-                <span class="tree-count">${n.count || ''}</span>
-            </div>`;
-                } else {
-                    // Branch (may also be a stage itself)
-                    const id = 'tb_' + Math.random().toString(36).substr(2, 6);
-                    h += `<div>
-                <div class="tree-branch-header" onclick="toggleBranch('${id}', this)">
-                    <span class="tree-arrow" id="a_${id}">${IC.chevron}</span>
-                    <span>${esc(n.label)}</span>
-                </div>
-                <div class="tree-children" id="${id}">${renderNodes(n.children)}</div>
-            </div>`;
-                }
-            }
-            return h;
-        }
-
-        function toggleBranch(id, el) {
-            const ch = $(id), ar = $('a_' + id);
-            if (ch) ch.classList.toggle('open');
-            if (ar) ar.classList.toggle('open');
-        }
-
-        function selectStage(el, stageName) {
-            // Highlight
-            document.querySelectorAll('.tree-leaf.active').forEach(e => e.classList.remove('active'));
-            el.classList.add('active');
-            S.stage = stageName;
-            S.versions = []; S.skip = 0; S.hasMore = true;
-            updateHeader();
-            loadVersions();
-        }
-
-        // ============ Header ============
-        function updateHeader() {
-            if (!S.stage) {
-                $('stagePath').innerHTML = '<span class="seg" style="color:var(--text-muted)">选择左侧数据阶段</span>';
-                $('headerInfo').textContent = '';
-                return;
-            }
-            const parts = S.stage.split('/');
-            $('stagePath').innerHTML = parts.map((p, i) =>
-                `${i > 0 ? '<span class="sep">/</span>' : ''}<span class="seg">${esc(p)}</span>`
-            ).join('');
-        }
-
-        // ============ Versions ============
-        async function loadVersions(append) {
-            if (S.loading) return;
-            S.loading = true;
-            if (!append) {
-                $('contentBody').innerHTML = '<div class="state-box"><div class="spinner"></div><p>加载中...</p></div>';
-            }
-            try {
-                const pid = S.stageProjectMap[S.stage];
-                const data = await api(`/projects/${pid}/stage-files?stage=${encodeURIComponent(S.stage)}&skip=${S.skip}&limit=${PAGE_SIZE}`);
-                if (!append) S.versions = [];
-                S.versions.push(...data);
-                S.hasMore = data.length >= PAGE_SIZE;
-                S.skip += data.length;
-                renderVersions();
-                const stageInfo = S.stages.find(s => s.name === S.stage);
-                $('headerInfo').textContent = stageInfo ? `共 ${stageInfo.version_count} 次提交` : '';
-            } catch (e) {
-                if (!append) $('contentBody').innerHTML = '<div class="state-box"><p style="color:#fc8181;">加载失败: ' + esc(e.message) + '</p></div>';
-            }
-            S.loading = false;
-        }
-
-        function renderVersions() {
-            if (!S.versions.length) {
-                $('contentBody').innerHTML = '<div class="state-box"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/></svg><h2>暂无数据</h2><p>该阶段还没有提交记录</p></div>';
-                return;
-            }
-            let h = '';
-
-            S.versions.forEach((v, i) => {
-                const tree = buildFileTree(v.files);
-                h += `<div class="version-card" style="animation-delay:${Math.min(i, 10) * 0.05}s">
-            <div class="version-head">
-                <span class="commit-tag">${IC.commit} ${esc(v.commit_id.substring(0, 8))}</span>
-                <span class="v-author">${v.author ? esc(v.author) : ''}</span>
-                <span class="v-time" title="${fmtTime(v.created_at)}">${relTime(v.created_at)}</span>
-            </div>
-            <div class="version-files">${renderTree(tree, v, 0)}</div>
-        </div>`;
-            });
-            if (S.hasMore) {
-                h += '<div class="load-more"><button class="load-more-btn" onclick="loadMore()">加载更多</button></div>';
-            }
-            $('contentBody').innerHTML = h;
-        }
-
-        function loadMore() {
-            const btn = document.querySelector('.load-more-btn');
-            if (btn) { btn.disabled = true; btn.textContent = '加载中...'; }
-            loadVersions(true);
-        }
-
-        // ============ File Grouping ============
-        function countFiles(node) {
-            let cnt = node.files.length;
-            Object.values(node.dirs).forEach(d => { cnt += countFiles(d); });
-            return cnt;
-        }
-
-        function buildFileTree(files) {
-            const root = { dirs: {}, files: [], path: '' };
-            if (!files || !files.length) return root;
-
-            files.forEach(f => {
-                const parts = f.relative_path.split('/');
-                let cur = root;
-                for (let i = 0; i < parts.length - 1; i++) {
-                    const p = parts[i];
-                    if (!cur.dirs[p]) {
-                        const curPath = cur.path ? cur.path + '/' + p : p;
-                        cur.dirs[p] = { name: p, path: curPath, dirs: {}, files: [] };
-                    }
-                    cur = cur.dirs[p];
-                }
-                cur.files.push(f);
-            });
-
-            function compact(node) {
-                const dirKeys = Object.keys(node.dirs);
-                dirKeys.forEach(k => {
-                    compact(node.dirs[k]);
-                });
-
-                Object.keys(node.dirs).forEach(k => {
-                    let child = node.dirs[k];
-                    if (!child) return;
-
-                    let changed = true;
-                    while (changed) {
-                        changed = false;
-
-                        if (Object.keys(child.dirs).length === 1 && child.files.length === 0) {
-                            const onlyChildKey = Object.keys(child.dirs)[0];
-                            const onlyChild = child.dirs[onlyChildKey];
-
-                            child.name = child.name + '/' + onlyChild.name;
-                            child.path = onlyChild.path;
-                            child.dirs = onlyChild.dirs;
-                            child.files = onlyChild.files;
-                            changed = true;
-                        }
-
-                        if (Object.keys(child.dirs).length === 0 && child.files.length === 1) {
-                            node.files.push(child.files[0]);
-                            delete node.dirs[k];
-                            // since child is deleted, break inner loops
-                            changed = false;
-                        }
-                    }
-                });
-            }
-            compact(root);
-
-            return root;
-        }
-
-        function renderTree(node, version, depth) {
-            let h = '';
-
-            const dirKeys = Object.keys(node.dirs).sort((a, b) => a.localeCompare(b));
-            dirKeys.forEach(k => {
-                const d = node.dirs[k];
-                const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
-                const fileCount = countFiles(d);
-                const padding = `padding-left: ${20 + depth * 24}px;`;
-
-                h += `
-            <div class="fg-header" style="${padding}" onclick="toggleFG('${gid}')">
-                <div class="fg-name-wrap">
-                    <span class="fg-arrow" id="fa_${gid}">${IC.chevron}</span>
-                    <span class="fg-icon">${IC.folder}</span>
-                    <span class="fg-name" title="${esc(d.path)}">${esc(d.name)}/</span>
-                    <span class="fg-count">${fileCount} 个文件</span>
-                </div>
-                <div></div>
-            </div>
-            <div class="fg-children" id="${gid}">
-                ${renderTree(d, version, depth + 1)}
-            </div>`;
-            });
-
-            node.files.sort((a, b) => a.relative_path.localeCompare(b.relative_path)).forEach(f => {
-                let displayName = f.relative_path || f.name;
-                if (node.path && f.relative_path.startsWith(node.path + '/')) {
-                    displayName = f.relative_path.substring(node.path.length + 1);
-                }
-                const padding = `padding-left: ${depth === 0 ? 20 : 44 + (depth - 1) * 24}px;`;
-
-                h += `
-    <div class="file-row" style="${padding}">
-        <div class="file-name-col" title="${esc(f.relative_path)}">
-            <span class="f-icon">${IC.file}</span>
-            <span class="f-name">${esc(displayName)}</span>
-            <span class="f-size">${fmtSize(f.file_size)}</span>
-        </div>
-        <div class="btn-dl-wrap">
-            <a class="btn-dl" href="/files/${f.id}/content" download="${esc(f.name)}" onclick="event.stopPropagation();">${IC.download}</a>
-        </div>
-    </div>`;
-            });
-
-            if (depth === 0 && h === '') {
-                return '<div style="padding:14px 20px;font-size:13px;color:var(--text-muted)">暂无文件</div>';
-            }
-            return h;
-        }
-
-        function toggleFG(id) {
-            const ch = $(id), ar = $('fa_' + id);
-            if (ch) ch.classList.toggle('open');
-            if (ar) ar.classList.toggle('open');
-        }
-
-        // ============ UI Helpers ============
-        function showWelcome() {
-            $('stagePath').innerHTML = '<span class="seg" style="color:var(--text-muted)">选择左侧数据阶段</span>';
-            $('headerInfo').textContent = '';
-            $('contentBody').innerHTML = `<div class="state-box">
-        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
-            <path d="M21 16V8a2 2 0 00-1-1.73l-7-4a2 2 0 00-2 0l-7 4A2 2 0 003 8v8a2 2 0 001 1.73l7 4a2 2 0 002 0l7-4A2 2 0 0021 16z"/>
-            <polyline points="3.27 6.96 12 12.01 20.73 6.96"/><line x1="12" y1="22.08" x2="12" y2="12"/>
-        </svg>
-        <h2>欢迎使用数据控制台</h2>
-        <p>从左侧选择数据阶段,查看文件版本历史</p>
-    </div>`;
-        }
-
-        // ============ Init ============
-        loadAllStages();
-    </script>
-</body>
-
-</html>

+ 0 - 712
app/static/index.html

@@ -1,712 +0,0 @@
-<!DOCTYPE html>
-<html lang="zh-CN">
-
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Data Nexus - 数据中台</title>
-    <meta name="description" content="Data Nexus 轻量级数据中台,浏览和下载项目数据文件">
-    <style>
-        *,
-        *::before,
-        *::after {
-            margin: 0;
-            padding: 0;
-            box-sizing: border-box;
-        }
-
-        :root {
-            --bg-primary: #0f172a;
-            --bg-surface: #1e293b;
-            --bg-hover: #334155;
-            --bg-card: rgba(30, 41, 59, 0.7);
-            --border: rgba(148, 163, 184, 0.1);
-            --border-hover: rgba(59, 130, 246, 0.4);
-            --text-primary: #f1f5f9;
-            --text-secondary: #94a3b8;
-            --text-muted: #64748b;
-            --accent: #3b82f6;
-            --accent-light: #60a5fa;
-            --accent-glow: rgba(59, 130, 246, 0.15);
-            --green: #22c55e;
-            --orange: #f59e0b;
-            --purple: #a78bfa;
-            --radius: 10px;
-        }
-
-        body {
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'PingFang SC', 'Microsoft YaHei', sans-serif;
-            background: var(--bg-primary);
-            color: var(--text-primary);
-            min-height: 100vh;
-            line-height: 1.6;
-        }
-
-        /* Header */
-        .header {
-            background: linear-gradient(135deg, rgba(15, 23, 42, 0.95), rgba(30, 41, 59, 0.9));
-            border-bottom: 1px solid var(--border);
-            backdrop-filter: blur(20px);
-            position: sticky;
-            top: 0;
-            z-index: 100;
-        }
-
-        .header-inner {
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 16px 24px;
-        }
-
-        .header-top {
-            display: flex;
-            align-items: center;
-            gap: 12px;
-            margin-bottom: 12px;
-        }
-
-        .logo-icon {
-            width: 32px;
-            height: 32px;
-            color: var(--accent);
-        }
-
-        .header-top h1 {
-            font-size: 20px;
-            font-weight: 700;
-            background: linear-gradient(135deg, var(--accent-light), var(--purple));
-            -webkit-background-clip: text;
-            background-clip: text;
-            -webkit-text-fill-color: transparent;
-        }
-
-        /* Breadcrumb */
-        .breadcrumb {
-            display: flex;
-            align-items: center;
-            gap: 4px;
-            flex-wrap: wrap;
-            font-size: 14px;
-        }
-
-        .breadcrumb-item {
-            color: var(--text-muted);
-            cursor: pointer;
-            padding: 2px 8px;
-            border-radius: 4px;
-            transition: all 0.2s;
-        }
-
-        .breadcrumb-item:hover {
-            color: var(--accent-light);
-            background: var(--accent-glow);
-        }
-
-        .breadcrumb-item.active {
-            color: var(--text-primary);
-            cursor: default;
-        }
-
-        .breadcrumb-item.active:hover {
-            background: none;
-            color: var(--text-primary);
-        }
-
-        .breadcrumb-sep {
-            color: var(--text-muted);
-            font-size: 12px;
-        }
-
-        .breadcrumb-sep svg {
-            width: 14px;
-            height: 14px;
-            vertical-align: middle;
-        }
-
-        /* Main content */
-        .main {
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 24px;
-        }
-
-        /* Item list */
-        .item-list {
-            background: var(--bg-card);
-            border: 1px solid var(--border);
-            border-radius: var(--radius);
-            overflow: hidden;
-            backdrop-filter: blur(10px);
-        }
-
-        .item {
-            display: flex;
-            align-items: center;
-            gap: 14px;
-            padding: 14px 20px;
-            border-bottom: 1px solid var(--border);
-            cursor: pointer;
-            transition: all 0.2s ease;
-            animation: fadeSlideIn 0.3s ease forwards;
-            opacity: 0;
-        }
-
-        .item:last-child {
-            border-bottom: none;
-        }
-
-        .item:hover {
-            background: var(--bg-hover);
-            border-color: var(--border-hover);
-        }
-
-        .item-icon {
-            flex-shrink: 0;
-            width: 20px;
-            height: 20px;
-        }
-
-        .item-icon svg {
-            width: 20px;
-            height: 20px;
-        }
-
-        .item-icon.project {
-            color: var(--accent-light);
-        }
-
-        .item-icon.stage {
-            color: var(--orange);
-        }
-
-        .item-icon.commit {
-            color: var(--green);
-        }
-
-        .item-icon.file {
-            color: var(--text-muted);
-        }
-
-        .item-icon.folder {
-            color: var(--orange);
-        }
-
-        .item-body {
-            flex: 1;
-            min-width: 0;
-        }
-
-        .item-name {
-            font-size: 15px;
-            font-weight: 500;
-            color: var(--text-primary);
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-
-        .item-desc {
-            font-size: 12px;
-            color: var(--text-muted);
-            margin-top: 2px;
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-
-        .item-meta {
-            flex-shrink: 0;
-            text-align: right;
-            font-size: 12px;
-            color: var(--text-muted);
-        }
-
-        .item-meta .meta-main {
-            color: var(--text-secondary);
-            font-size: 13px;
-        }
-
-        .item-arrow {
-            flex-shrink: 0;
-            color: var(--text-muted);
-            transition: transform 0.2s;
-        }
-
-        .item:hover .item-arrow {
-            color: var(--accent-light);
-            transform: translateX(3px);
-        }
-
-        .item-arrow svg {
-            width: 16px;
-            height: 16px;
-        }
-
-        /* Download button */
-        .btn-download {
-            display: inline-flex;
-            align-items: center;
-            gap: 6px;
-            padding: 6px 14px;
-            border-radius: 6px;
-            border: none;
-            background: var(--accent);
-            color: white;
-            font-size: 12px;
-            font-weight: 500;
-            cursor: pointer;
-            transition: all 0.2s;
-            text-decoration: none;
-        }
-
-        .btn-download:hover {
-            background: var(--accent-light);
-            transform: translateY(-1px);
-        }
-
-        .btn-download svg {
-            width: 14px;
-            height: 14px;
-        }
-
-        /* File tree */
-        .file-tree {
-            padding: 8px 0;
-        }
-
-        .tree-item {
-            display: flex;
-            align-items: center;
-            gap: 10px;
-            padding: 10px 20px;
-            border-bottom: 1px solid var(--border);
-            transition: background 0.15s;
-            animation: fadeSlideIn 0.3s ease forwards;
-            opacity: 0;
-        }
-
-        .tree-item:last-child {
-            border-bottom: none;
-        }
-
-        .tree-item:hover {
-            background: var(--bg-hover);
-        }
-
-        .tree-item .indent {
-            flex-shrink: 0;
-        }
-
-        .tree-item .item-icon {
-            flex-shrink: 0;
-        }
-
-        .tree-item .item-body {
-            flex: 1;
-            min-width: 0;
-        }
-
-        .tree-item .item-name {
-            font-size: 14px;
-        }
-
-        .tree-folder-toggle {
-            cursor: pointer;
-            color: var(--text-muted);
-            width: 16px;
-            height: 16px;
-            flex-shrink: 0;
-            transition: transform 0.2s;
-        }
-
-        .tree-folder-toggle.expanded {
-            transform: rotate(90deg);
-        }
-
-        .tree-children {
-            display: none;
-        }
-
-        .tree-children.expanded {
-            display: block;
-        }
-
-        /* Status badges */
-        .badge {
-            display: inline-block;
-            padding: 2px 8px;
-            border-radius: 4px;
-            font-size: 11px;
-            font-weight: 600;
-        }
-
-        .badge-stage {
-            background: rgba(245, 158, 11, 0.15);
-            color: var(--orange);
-            border: 1px solid rgba(245, 158, 11, 0.25);
-        }
-
-        .badge-count {
-            background: rgba(59, 130, 246, 0.15);
-            color: var(--accent-light);
-            border: 1px solid rgba(59, 130, 246, 0.2);
-        }
-
-        /* Loading & Empty states */
-        .state-container {
-            display: flex;
-            flex-direction: column;
-            align-items: center;
-            justify-content: center;
-            padding: 80px 20px;
-            color: var(--text-muted);
-        }
-
-        .state-container svg {
-            width: 48px;
-            height: 48px;
-            margin-bottom: 16px;
-            opacity: 0.5;
-        }
-
-        .state-container p {
-            font-size: 15px;
-        }
-
-        .spinner {
-            width: 32px;
-            height: 32px;
-            border: 3px solid var(--border);
-            border-top-color: var(--accent);
-            border-radius: 50%;
-            animation: spin 0.8s linear infinite;
-            margin-bottom: 16px;
-        }
-
-        /* Section header */
-        .section-header {
-            padding: 12px 20px;
-            background: rgba(15, 23, 42, 0.5);
-            border-bottom: 1px solid var(--border);
-            font-size: 12px;
-            color: var(--text-muted);
-            font-weight: 600;
-            text-transform: uppercase;
-            letter-spacing: 0.5px;
-        }
-
-        /* Animations */
-        @keyframes fadeSlideIn {
-            from {
-                opacity: 0;
-                transform: translateY(8px);
-            }
-
-            to {
-                opacity: 1;
-                transform: translateY(0);
-            }
-        }
-
-        @keyframes spin {
-            to {
-                transform: rotate(360deg);
-            }
-        }
-
-        /* Responsive */
-        @media (max-width: 640px) {
-
-            .header-inner,
-            .main {
-                padding-left: 16px;
-                padding-right: 16px;
-            }
-
-            .item {
-                padding: 12px 14px;
-                gap: 10px;
-            }
-
-            .item-meta {
-                display: none;
-            }
-        }
-    </style>
-</head>
-
-<body>
-    <header class="header">
-        <div class="header-inner">
-            <div class="header-top">
-                <svg class="logo-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                    <path
-                        d="M21 16V8a2 2 0 00-1-1.73l-7-4a2 2 0 00-2 0l-7 4A2 2 0 003 8v8a2 2 0 001 1.73l7 4a2 2 0 002 0l7-4A2 2 0 0021 16z" />
-                    <polyline points="3.27 6.96 12 12.01 20.73 6.96" />
-                    <line x1="12" y1="22.08" x2="12" y2="12" />
-                </svg>
-                <h1>Data Nexus</h1>
-            </div>
-            <nav id="breadcrumb" class="breadcrumb"></nav>
-        </div>
-    </header>
-    <main class="main">
-        <div id="content"></div>
-    </main>
-
-    <script>
-        // ============ SVG Icons ============
-        const SVG = {
-            project: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 16V8a2 2 0 00-1-1.73l-7-4a2 2 0 00-2 0l-7 4A2 2 0 003 8v8a2 2 0 001 1.73l7 4a2 2 0 002 0l7-4A2 2 0 0021 16z"/><polyline points="3.27 6.96 12 12.01 20.73 6.96"/><line x1="12" y1="22.08" x2="12" y2="12"/></svg>',
-            folder: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 19a2 2 0 01-2 2H4a2 2 0 01-2-2V5a2 2 0 012-2h5l2 3h9a2 2 0 012 2z"/></svg>',
-            commit: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="4"/><line x1="1.05" y1="12" x2="7" y2="12"/><line x1="17.01" y1="12" x2="22.96" y2="12"/></svg>',
-            file: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/></svg>',
-            download: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" y1="15" x2="12" y2="3"/></svg>',
-            chevron: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="9 18 15 12 9 6"/></svg>',
-            home: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M3 9l9-7 9 7v11a2 2 0 01-2 2H5a2 2 0 01-2-2z"/><polyline points="9 22 9 12 15 12 15 22"/></svg>',
-            empty: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5"><path d="M13 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V9z"/><path d="M13 2v7h7"/></svg>',
-        };
-
-        const chevronSep = `<span class="breadcrumb-sep">${SVG.chevron}</span>`;
-
-        // ============ State ============
-        let state = {
-            project: null,
-            stage: null,
-            version: null,
-            versionsCache: [],  // all versions for current project
-        };
-
-        const $content = document.getElementById('content');
-        const $breadcrumb = document.getElementById('breadcrumb');
-
-        // ============ Utils ============
-        function formatSize(bytes) {
-            if (!bytes && bytes !== 0) return '-';
-            const u = ['B', 'KB', 'MB', 'GB', 'TB'];
-            let i = 0, s = bytes;
-            while (s >= 1024 && i < u.length - 1) { s /= 1024; i++; }
-            return s.toFixed(i > 0 ? 1 : 0) + ' ' + u[i];
-        }
-
-        function formatTime(iso) {
-            if (!iso) return '';
-            const d = new Date(iso);
-            const pad = n => String(n).padStart(2, '0');
-            return `${d.getFullYear()}-${pad(d.getMonth() + 1)}-${pad(d.getDate())} ${pad(d.getHours())}:${pad(d.getMinutes())}`;
-        }
-
-        function relativeTime(iso) {
-            if (!iso) return '';
-            const diff = Date.now() - new Date(iso).getTime();
-            const m = Math.floor(diff / 60000);
-            if (m < 1) return '刚刚';
-            if (m < 60) return m + ' 分钟前';
-            const h = Math.floor(m / 60);
-            if (h < 24) return h + ' 小时前';
-            const d = Math.floor(h / 24);
-            if (d < 30) return d + ' 天前';
-            return formatTime(iso);
-        }
-
-        function staggerDelay(i) {
-            return `animation-delay: ${i * 0.04}s;`;
-        }
-
-        async function api(path) {
-            const res = await fetch(path);
-            if (!res.ok) throw new Error(`API Error: ${res.status}`);
-            return res.json();
-        }
-
-        // ============ Rendering ============
-        function renderLoading() {
-            $content.innerHTML = `<div class="item-list"><div class="state-container"><div class="spinner"></div><p>加载中...</p></div></div>`;
-        }
-
-        function renderEmpty(msg) {
-            $content.innerHTML = `<div class="item-list"><div class="state-container">${SVG.empty}<p>${msg}</p></div></div>`;
-        }
-
-        function renderError(msg) {
-            $content.innerHTML = `<div class="item-list"><div class="state-container"><p style="color:#ef4444;">⚠ ${msg}</p><p style="margin-top:8px;font-size:13px;cursor:pointer;color:var(--accent);" onclick="loadProjects()">点击重试</p></div></div>`;
-        }
-
-        // ============ Breadcrumb ============
-        function updateBreadcrumb() {
-            let html = `<span class="breadcrumb-item ${!state.project ? 'active' : ''}" onclick="loadProjects()">所有项目</span>`;
-            if (state.project) {
-                html += chevronSep + `<span class="breadcrumb-item ${!state.stage ? 'active' : ''}" onclick="selectProject('${state.project.id}')">${state.project.project_name}</span>`;
-            }
-            if (state.stage) {
-                html += chevronSep + `<span class="breadcrumb-item ${!state.version ? 'active' : ''}" onclick="selectStage('${state.stage}')">${state.stage}</span>`;
-            }
-            if (state.version) {
-                html += chevronSep + `<span class="breadcrumb-item active">${state.version.commit_id.substring(0, 8)}</span>`;
-            }
-            $breadcrumb.innerHTML = html;
-        }
-
-        // ============ Level 1: Projects ============
-        async function loadProjects() {
-            state = { project: null, stage: null, version: null, versionsCache: [] };
-            updateBreadcrumb();
-            renderLoading();
-            try {
-                const projects = await api('/projects?limit=500');
-                if (!projects.length) return renderEmpty('暂无项目');
-                let html = '<div class="item-list"><div class="section-header">项目列表</div>';
-                projects.forEach((p, i) => {
-                    html += `<div class="item" style="${staggerDelay(i)}" onclick="selectProject('${p.id}')">
-            <div class="item-icon project">${SVG.project}</div>
-            <div class="item-body">
-              <div class="item-name">${esc(p.project_name)}</div>
-              ${p.description ? `<div class="item-desc">${esc(p.description)}</div>` : ''}
-            </div>
-            <div class="item-meta"><div class="meta-main">${relativeTime(p.created_at)}</div><div>${formatTime(p.created_at)}</div></div>
-            <div class="item-arrow">${SVG.chevron}</div>
-          </div>`;
-                });
-                html += '</div>';
-                $content.innerHTML = html;
-            } catch (e) { renderError('加载项目失败: ' + e.message); }
-        }
-
-        // ============ Level 2: Stages ============
-        async function selectProject(projectId) {
-            renderLoading();
-            try {
-                const projects = await api('/projects?limit=500');
-                state.project = projects.find(p => p.id === projectId);
-                state.stage = null; state.version = null;
-                const versions = await api(`/projects/${projectId}/versions?limit=5000`);
-                state.versionsCache = versions;
-                updateBreadcrumb();
-
-                // Extract unique stages with stats
-                const stageMap = {};
-                versions.forEach(v => {
-                    if (!stageMap[v.stage]) stageMap[v.stage] = { count: 0, latest: v.created_at };
-                    stageMap[v.stage].count++;
-                    if (new Date(v.created_at) > new Date(stageMap[v.stage].latest))
-                        stageMap[v.stage].latest = v.created_at;
-                });
-                const stages = Object.entries(stageMap).sort((a, b) => a[0].localeCompare(b[0]));
-
-                if (!stages.length) return renderEmpty('该项目暂无数据');
-                let html = '<div class="item-list"><div class="section-header">数据阶段</div>';
-                stages.forEach(([name, info], i) => {
-                    html += `<div class="item" style="${staggerDelay(i)}" onclick="selectStage('${esc(name)}')">
-            <div class="item-icon stage">${SVG.folder}</div>
-            <div class="item-body">
-              <div class="item-name">${esc(name)}</div>
-              <div class="item-desc"><span class="badge badge-count">${info.count} 次提交</span></div>
-            </div>
-            <div class="item-meta"><div class="meta-main">最近更新</div><div>${relativeTime(info.latest)}</div></div>
-            <div class="item-arrow">${SVG.chevron}</div>
-          </div>`;
-                });
-                html += '</div>';
-                $content.innerHTML = html;
-            } catch (e) { renderError('加载阶段失败: ' + e.message); }
-        }
-
-        // ============ Level 3: Commits/Versions ============
-        function selectStage(stageName) {
-            state.stage = stageName; state.version = null;
-            updateBreadcrumb();
-
-            const versions = state.versionsCache
-                .filter(v => v.stage === stageName)
-                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
-
-            if (!versions.length) return renderEmpty('该阶段暂无提交');
-            let html = '<div class="item-list"><div class="section-header">提交记录</div>';
-            versions.forEach((v, i) => {
-                const shortId = v.commit_id.substring(0, 8);
-                html += `<div class="item" style="${staggerDelay(i)}" onclick="selectVersion('${v.id}')">
-          <div class="item-icon commit">${SVG.commit}</div>
-          <div class="item-body">
-            <div class="item-name"><code style="font-size:13px;color:var(--accent-light);background:var(--accent-glow);padding:2px 6px;border-radius:4px;">${shortId}</code></div>
-            <div class="item-desc">${v.author ? '作者: ' + esc(v.author) : ''}</div>
-          </div>
-          <div class="item-meta"><div class="meta-main">${relativeTime(v.created_at)}</div><div>${formatTime(v.created_at)}</div></div>
-          <div class="item-arrow">${SVG.chevron}</div>
-        </div>`;
-            });
-            html += '</div>';
-            $content.innerHTML = html;
-        }
-
-        // ============ Level 4: Files ============
-        async function selectVersion(versionId) {
-            renderLoading();
-            try {
-                const v = state.versionsCache.find(x => x.id === versionId);
-                if (v) state.version = v;
-                updateBreadcrumb();
-
-                const tree = await api(`/versions/${versionId}/files`);
-                if (!tree.length) return renderEmpty('该提交暂无文件');
-
-                let html = '<div class="item-list"><div class="section-header">文件列表</div><div class="file-tree">';
-                html += renderTree(tree, 0);
-                html += '</div></div>';
-                $content.innerHTML = html;
-            } catch (e) { renderError('加载文件失败: ' + e.message); }
-        }
-
-        function renderTree(nodes, depth) {
-            let html = '';
-            let idx = 0;
-            nodes.forEach(node => {
-                const indent = `<span class="indent" style="width:${depth * 24}px;display:inline-block;"></span>`;
-                if (node.type === 'folder') {
-                    const folderId = 'f_' + Math.random().toString(36).substr(2, 8);
-                    html += `<div class="tree-item" style="${staggerDelay(idx++)}" onclick="toggleFolder('${folderId}', this)">
-            ${indent}
-            <span class="tree-folder-toggle" id="toggle_${folderId}">${SVG.chevron}</span>
-            <div class="item-icon folder">${SVG.folder}</div>
-            <div class="item-body"><div class="item-name">${esc(node.name)}</div></div>
-          </div>
-          <div class="tree-children" id="${folderId}">
-            ${renderTree(node.children || [], depth + 1)}
-          </div>`;
-                } else {
-                    html += `<div class="tree-item" style="${staggerDelay(idx++)}">
-            ${indent}
-            <span style="width:16px;display:inline-block;"></span>
-            <div class="item-icon file">${SVG.file}</div>
-            <div class="item-body">
-              <div class="item-name">${esc(node.name)}</div>
-              <div class="item-desc">${formatSize(node.size)}${node.file_type ? ' · ' + node.file_type : ''}</div>
-            </div>
-            <a class="btn-download" href="/files/${node.id}/content" download="${esc(node.name)}" onclick="event.stopPropagation();">
-              ${SVG.download} 下载
-            </a>
-          </div>`;
-                }
-            });
-            return html;
-        }
-
-        function toggleFolder(id, el) {
-            const children = document.getElementById(id);
-            const toggle = document.getElementById('toggle_' + id);
-            if (children) children.classList.toggle('expanded');
-            if (toggle) toggle.classList.toggle('expanded');
-        }
-
-        function esc(s) {
-            if (!s) return '';
-            const d = document.createElement('div');
-            d.textContent = s;
-            return d.innerHTML;
-        }
-
-        // ============ Init ============
-        loadProjects();
-    </script>
-</body>
-
-</html>

+ 1 - 4
manifest.yaml.example

@@ -49,14 +49,11 @@ stages:
         label: 帖子输入         # 指定该数据的业务名称(标签)
 
       # 示例 F:指定文件为“输出”,并且如果是 JSON 文件,可以提取特定 key 的值
-      #          同时指定深度 directory_depth: 2,这样如果文件在 `data/output/foo/bar.json`
-      #          它的 group_key 会被设置成 `data/output` 而不是默认的 `data/output/foo`
       - path: data/output/
         pattern: "*.json"
         direction: output
         label: 灵感点
         extract_json_key: "data.idea_content"  # 会解析 JSON 并提取对应 key 的值保存
-        directory_depth: 2
         
   # ---------- 阶段 5:动态配对输入文件(自动提取) ----------
   - name: auto_paired_data
@@ -96,7 +93,7 @@ stages:
 #     - direction (可选) 该文件的流入/流出方向(如 'input', 'output' 等)
 #     - label     (可选) 该文件的业务称呼/标签(如 '帖子输入', '灵感点' 等)
 #     - extract_json_key (可选) 针对 JSON 文件,配置要提取解析的 JSON key 路径(支持用 . 分隔的嵌套路径,例如 'data.content')。提取的值会被记录在数据库中。
-#     - directory_depth  (可选) 定义这组规则生成的文件关联用的父目录深度(如 1 或 2,用来将不同子目录的关联文件合并到一行展示)。
+
 #     - paired_input     (可选) 动态输入映射规则,用于输出生成后主动拉取关联的输入。包含:
 #       - extract_regex:   提取路径变量的正则表达式 (必需使用命名捕获组,如 (?P<var>...))
 #       - path_template:   组装对应输入文件路径的模板 (如 "aigc_data/{var}/file.json")

+ 0 - 44
migrate_raw.py

@@ -1,44 +0,0 @@
-import pymysql
-import os
-
-def migrate():
-    try:
-        conn = pymysql.connect(
-            host='rm-t4n8oyqunr5b4461s6o.mysql.singapore.rds.aliyuncs.com',
-            port=3306,
-            user='developer_saas',
-            password='developer_saas#Aiddit',
-            db='data_nexus'
-        )
-        with conn.cursor() as cursor:
-            print("Connected to DB")
-            
-            # 1. Add commit_message to data_records
-            try:
-                cursor.execute("ALTER TABLE data_records ADD COLUMN commit_message TEXT DEFAULT NULL;")
-                print("Added commit_message to data_records")
-            except Exception as e:
-                print(f"Skipping data_records.commit_message: {e}")
-
-            # 2. Add commit_message to data_versions
-            try:
-                cursor.execute("ALTER TABLE data_versions ADD COLUMN commit_message TEXT DEFAULT NULL;")
-                print("Added commit_message to data_versions")
-            except Exception as e:
-                print(f"Skipping data_versions.commit_message: {e}")
-
-            # 3. Add content_hash to data_records
-            try:
-                cursor.execute("ALTER TABLE data_records ADD COLUMN content_hash VARCHAR(64) DEFAULT NULL;")
-                print("Added content_hash to data_records")
-            except Exception as e:
-                print(f"Skipping data_records.content_hash: {e}")
-
-        conn.commit()
-        conn.close()
-        print("Migration done")
-    except Exception as e:
-        print(f"Global error: {e}")
-
-if __name__ == "__main__":
-    migrate()

+ 0 - 33
migrate_schema.py

@@ -1,33 +0,0 @@
-from sqlalchemy import text
-from app.database import engine
-
-def run_migrations():
-    with engine.connect() as conn:
-        print("Starting migrations...")
-        
-        # 1. Add commit_message to data_records
-        try:
-            conn.execute(text("ALTER TABLE data_records ADD COLUMN commit_message TEXT DEFAULT NULL;"))
-            print("Added commit_message to data_records")
-        except Exception as e:
-            print(f"Skipping commit_message for data_records: {e}")
-
-        # 2. Add commit_message to data_versions
-        try:
-            conn.execute(text("ALTER TABLE data_versions ADD COLUMN commit_message TEXT DEFAULT NULL;"))
-            print("Added commit_message to data_versions")
-        except Exception as e:
-            print(f"Skipping commit_message for data_versions: {e}")
-
-        # 3. Add content_hash to data_records
-        try:
-            conn.execute(text("ALTER TABLE data_records ADD COLUMN content_hash VARCHAR(64) DEFAULT NULL;"))
-            print("Added content_hash to data_records")
-        except Exception as e:
-            print(f"Skipping content_hash for data_records: {e}")
-        
-        conn.commit()
-        print("Migrations complete.")
-
-if __name__ == "__main__":
-    run_migrations()

+ 0 - 29
reaggregate_records.py

@@ -1,29 +0,0 @@
-from app.database import SessionLocal
-from app.models import DataVersion
-from app.services.storage_service import StorageService
-from app.services.gogs_client import GogsClient
-import logging
-
-# Setup basic logging to see what's happening
-logging.basicConfig(level=logging.INFO)
-
-def reaggregate():
-    db = SessionLocal()
-    gogs = GogsClient()
-    storage = StorageService(db, gogs)
-    
-    versions = db.query(DataVersion).all()
-    print(f"Found {len(versions)} versions to re-aggregate.")
-    
-    for v in versions:
-        print(f"Re-aggregating version {v.id} (Stage: {v.stage}, Commit: {v.commit_id[:8]})")
-        try:
-            storage.aggregate_version_records(v)
-        except Exception as e:
-            print(f"Error re-aggregating version {v.id}: {e}")
-    
-    db.close()
-    print("Done!")
-
-if __name__ == "__main__":
-    reaggregate()

+ 56 - 40
使用指南.md

@@ -1,12 +1,12 @@
-# DataNexus 使用指南
+# 全局中后台-结果上云使用指南
 
 ## 这是什么?
 
-DataNexus 是一个数据自动归集系统。只要你在项目中配置好 `manifest.yaml`,每次 `git push` 后,系统会自动把你指定的文件上传到云端,并保留历史版本。
+本系统是一个数据自动归集系统。只要你在项目中配置好 `manifest.yaml`,每次 `git push` 后,系统会自动把你指定的文件上传到云端,并保留历史版本。
 
 ## 前置准备:仓库权限
 
-DataNexus 需要对你的仓库拥有**管理权限**,才能自动读取配置并设置 Webhook。请根据你的仓库归属情况,确认是否需要额外操作:
+本系统需要对你的仓库拥有**管理权限**,才能自动读取配置并设置 Webhook。请根据你的仓库归属情况,确认是否需要额外操作:
 
 ### 情况一:仓库属于 AIGC 或 Server 组织 ✅ 无需操作
 
@@ -22,7 +22,7 @@ DataNexus 需要对你的仓库拥有**管理权限**,才能自动读取配置
 4. 权限选择 **管理(Admin)**
 5. 点击确认
 
-> 💡 **为什么需要管理权限?** DataNexus 需要 Admin 权限来为仓库自动配置 Webhook,这是触发数据自动归集的前提。添加后系统会在下次扫描时自动完成 Webhook 配置,你无需其他手动操作。
+> 💡 **为什么需要管理权限?** 本系统需要 Admin 权限来为仓库自动配置 Webhook,这是触发数据自动归集的前提。添加后系统会在下次扫描时自动完成 Webhook 配置,你无需其他手动操作。
 
 > 💡 **推荐做法:** 如果没有特殊原因,建议将数据相关的仓库创建在 **AIGC** 或 **Server** 组织下,这样可以跳过授权步骤,开箱即用。
 
@@ -61,35 +61,58 @@ git push
 project_name: "topic_research"    # 项目唯一标识(必填)
 
 stages:
-  - name: "selection"             # 环节名称(必填)
+  - name: "what选题"             # 环节(业务文件)名称(必填)
     outputs:
-      - path: "./results/report.csv"           # 上传单个文件
-      - path: "./output_images/"               # 上传整个目录
+      - path: "./aiddit/decode/topic/result/topic.json"           # 上传单个文件
+      - path: "./aiddit/decode/topic/result/"               # 上传整个目录
 ```
 
-### 完整示例(多环节)
+### 多环节(多业务文件)与过滤示例
 
 ```yaml
 project_name: "topic_research"
 
 stages:
-  # 环节1:选题
-  - name: "selection"
+  # 环节(业务文件)1:制作表
+  - name: "制作表"
     outputs:
       - path: "./results/daily_report.csv"
       - path: "./output_images/"
         pattern: "*.png"                       # 只上传 png 文件
 
-  # 环节2:数据清洗
-  - name: "cleaning"
+  # 环节(业务文件)2:创作表
+  - name: "创作表"
     outputs:
       - path: "./cleaned_data/"
-        pattern: "*.csv"
+        pattern: ["*.csv", "*.xlsx"]           # 包含 csv 和 xlsx
+        exclude: ["temp_*", "*.tmp"]           # 排除临时文件
+```
+
+### 高阶特性示例 (元数据提取与动态关联)
+
+```yaml
+project_name: "what-创作"
 
-  # 环节3:分析报告
-  - name: "analysis"
+stages:
+  # 根据输出反向自动拉取输入 (动态正则关联)
+  - name: "创作表"
     outputs:
-      - path: "./reports/"
+      - path: aiddit/decode/topic/result/
+        pattern: "*.json"
+        direction: output
+        label: "标准化结果"
+        # 使用 paired_inputs (加 's' 并使用列表) 支持提取多个文件
+        paired_inputs:
+          # 提取原始数据1
+          - extract_regex: "aiddit/decode/topic/result/(?P<name>[^/]+)/final/(?P<filename>[^/]+)"
+            path_template: "aigc_data/{name}/{filename}"
+            direction: input
+            label: "原始帖子数据"
+          # 提取原始数据2 (可配置多个)
+          - extract_regex: "aiddit/decode/topic/result/(?P<name>[^/]+)/final/(?P<filename>[^/]+)"
+            path_template: "other_data/{name}/config.json"
+            direction: input
+            label: "附带配置组"
 ```
 
 ### 配置说明
@@ -98,30 +121,23 @@ stages:
 |------|------|------|
 | `project_name` | ✅ | 项目唯一标识,建议用英文 |
 | `stages` | ✅ | 环节列表 |
-| `stages[].name` | ✅ | 环节名称,如 selection、cleaning、analysis |
-| `stages[].outputs` | ✅ | 要上传的文件/目录列表 |
-| `outputs[].path` | ✅ | 文件或目录路径(相对于项目根目录) |
-| `outputs[].pattern` | ❌ | 文件包含模式,默认 `*`。支持单个字符串或列表 |
-| `outputs[].exclude` | ❌ | 文件排除模式,默认无。支持单个字符串或列表 |
-
-### path 写法
-
-```yaml
-# 单个文件
-- path: "./data/result.csv"
-
-# 整个目录(注意结尾的 /)
-- path: "./output/"
-
-# 带匹配模式的目录
-- path: "./images/"
-  pattern: "*.png"          # 只匹配 png 文件
-
-# 多模式与排除(升级功能)
-- path: "./data/"
-  pattern: ["*.csv", "*.xlsx"]  # 包含 csv 和 xlsx
-  exclude: ["temp_*", "*.tmp"]  # 排除临时文件
-```
+| `stages[].name` | ✅ | 环节名称,如 raw、cleaned、output |
+| `stages[].outputs` | ✅ | 要提取的文件/目录列表 |
+| `outputs[].path` | ✅ | 文件或目录路径(相对于项目根目录,以 `/` 结尾表示目录) |
+| `outputs[].pattern` | ❌ | 文件包含模式,支持单个或列表(如 `*.csv`) |
+| `outputs[].exclude` | ❌ | 文件排除模式,支持单个或列表(如 `*.tmp`) |
+| `outputs[].direction` | ❌ | 数据流向标记(如 `input`, `output`) |
+| `outputs[].label` | ❌ | 数据的业务标签(如 `帖子输入`, `报告`) |
+| `outputs[].extract_json_key` | ❌ | 针对 JSON 文件,提取解析指定 key 路径(支持 `.` 嵌套)的值并保存 |
+| `outputs[].paired_input` | ❌ | 动态输入映射规则(单配置)。提取路径参数并自动关联输入文件 |
+| `outputs[].paired_inputs` | ❌ | 动态输入映射规则列表(多配置)。和 `paired_input` 作用相同,但可以同时提取多个不同的反拉规则 |
+
+### path 写法提示
+
+- **单个文件**:`- path: "./data/result.csv"`
+- **整个目录**:`- path: "./output/"`(注意结尾的 `/`)
+- **混合模式**:不仅可以指定包含和排除,还可以通过 `pattern` 列表实现多特征匹配。
 
 ---