4 Revize 83c73ee3cd ... da6667cd64

Autor SHA1 Zpráva Datum
  tanjingyu da6667cd64 fix:records.html před 2 měsíci
  tanjingyu 5ec7232cba fix:asyncio.gather分支 před 2 měsíci
  tanjingyu 6afa58b3e6 feat:records.html页面、支持directory_depth、label的manifest.yaml před 2 měsíci
  tanjingyu 8809daff3c feat:records.html页面、支持directory_depth、label的manifest.yaml před 2 měsíci

+ 32 - 1
app/main.py

@@ -56,7 +56,11 @@ def build_file_tree(files: List[DataFile]) -> list:
                     "id": f.id,
                     "size": f.file_size,
                     "file_type": f.file_type,
-                    "sha": f.file_sha
+                    "sha": f.file_sha,
+                    "direction": f.direction,
+                    "label": f.label,
+                    "extracted_value": f.extracted_value,
+                    "group_key": f.group_key
                 })
             else:
                 # It's a folder
@@ -93,6 +97,12 @@ def filesystem_page():
     return FileResponse(os.path.join(STATIC_DIR, "index.html"), media_type="text/html")
 
 
@app.get("/records")
def records_page():
    """Serve the static HTML page that renders aggregated data records."""
    records_html = os.path.join(STATIC_DIR, "records.html")
    return FileResponse(records_html, media_type="text/html")
+
+
 @app.get("/api/health")
 def health_check():
     """Health check endpoint."""
@@ -226,11 +236,32 @@ def get_stage_files(
                 "file_size": f.file_size,
                 "file_type": f.file_type,
                 "file_sha": f.file_sha,
+                "direction": f.direction,
+                "label": f.label,
+                "extracted_value": f.extracted_value,
+                "group_key": f.group_key,
             } for f in files]
         })
     return result
 
 
@app.get("/projects/{project_id}/records", response_model=List[schemas.DataRecordOut])
def list_data_records(
    project_id: str,
    stage: Optional[str] = None,
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    """Return a project's data records, newest first.

    Optionally filters by pipeline stage; `skip`/`limit` paginate the result.
    """
    from app.models import DataRecord  # local import, mirrors the original placement
    conditions = [DataRecord.project_id == project_id]
    if stage:
        conditions.append(DataRecord.stage == stage)
    return (
        db.query(DataRecord)
        .filter(*conditions)
        .order_by(DataRecord.created_at.desc())
        .offset(skip)
        .limit(limit)
        .all()
    )
+
+
 # ==================== Version APIs ====================
 
 @app.get("/projects/{project_id}/versions", response_model=List[schemas.DataVersionOut])

+ 28 - 1
app/models.py

@@ -1,4 +1,4 @@
-from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, BigInteger, UniqueConstraint
+from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, BigInteger, UniqueConstraint, JSON
 from sqlalchemy.orm import relationship
 from sqlalchemy.sql import func
 from ulid import ULID
@@ -29,6 +29,7 @@ class DataVersion(Base):
     stage = Column(String(200), nullable=False)
     commit_id = Column(String(64), nullable=False)
     author = Column(String(50))
+    commit_message = Column(Text)
     manifest_snapshot = Column(Text)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
 
@@ -50,6 +51,32 @@ class DataFile(Base):
     file_size = Column(BigInteger)
     file_type = Column(String(20))
     file_sha = Column(String(64), index=True)  # Git Blob SHA for deduplication
+    direction = Column(String(20), nullable=True)  # e.g., 'input' or 'output'
+    label = Column(String(100), nullable=True)     # e.g., '帖子输入'
+    extracted_value = Column(Text, nullable=True)  # extracted JSON value
+    group_key = Column(String(255), nullable=True) # Used to group related inputs and outputs
     created_at = Column(DateTime(timezone=True), server_default=func.now())
 
     version = relationship("DataVersion", back_populates="files")
+
class DataRecord(Base):
    """One aggregated record: the related input/output files (sharing a
    group_key) of a single data version, stored as denormalized JSON."""
    __tablename__ = "data_records"

    id = Column(String(26), primary_key=True, default=generate_ulid)  # ULID string key
    project_id = Column(String(26), ForeignKey("projects.id"))
    version_id = Column(String(26), ForeignKey("data_versions.id"))
    stage = Column(String(200), index=True)  # pipeline stage name; indexed for filtered listing
    commit_id = Column(String(64))           # Git commit this record was built from
    commit_message = Column(Text)
    group_key = Column(String(255))          # key that grouped these files together

    # JSON arrays of per-file dicts (id, relative_path, file_sha, label, ...)
    inputs = Column(JSON)
    outputs = Column(JSON)

    # SHA-256 over the sorted file SHAs of the group; deterministic per file set
    content_hash = Column(String(64))

    author = Column(String(50))
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    version = relationship("DataVersion")
    project = relationship("Project")

+ 26 - 0
app/schemas.py

@@ -29,6 +29,10 @@ class DataFileBase(BaseModel):
     file_size: int
     file_type: str
     file_sha: str
+    direction: Optional[str] = None
+    label: Optional[str] = None
+    extracted_value: Optional[str] = None
+    group_key: Optional[str] = None
 
 
 class DataFileOut(DataFileBase):
@@ -48,6 +52,7 @@ class DataVersionBase(BaseModel):
     stage: str
     commit_id: str
     author: Optional[str] = None
+    commit_message: Optional[str] = None
     manifest_snapshot: Optional[str] = None
 
 
@@ -66,3 +71,24 @@ class DataVersionWithFiles(DataVersionOut):
 
 # Keep old name for backward compatibility
 DataVersion = DataVersionOut
+
+
class DataRecordBase(BaseModel):
    """Shared fields of a data record (a group of related input/output files)."""
    stage: str
    commit_id: str
    commit_message: Optional[str] = None
    group_key: Optional[str] = None
    content_hash: Optional[str] = None
    # NOTE(review): Pydantic copies field defaults per instance, so the shared
    # [] default is not the classic mutable-default pitfall.
    inputs: Optional[List] = []
    outputs: Optional[List] = []
    author: Optional[str] = None
+
+
class DataRecordOut(DataRecordBase):
    """Response shape for a persisted data record (adds identifiers/timestamps)."""
    id: str
    project_id: str
    version_id: str
    created_at: datetime

    class Config:
        # Allow building this model directly from SQLAlchemy ORM objects.
        from_attributes = True

+ 17 - 22
app/services/gogs_client.py

@@ -126,30 +126,24 @@ class GogsClient:
             return None
 
     async def get_directory_tree(self, owner: str, repo: str, commit_id: str, dir_path: str) -> list:
-        """Get all files under a specific directory (recursive).
-
-        Args:
-            dir_path: Directory path without trailing slash (e.g., "data/output")
-
-        Returns:
-            List of file info dicts with 'path', 'sha', 'size', 'type'
-        """
+        """Get all files under a specific directory (recursive) using concurrency."""
+        import asyncio
         all_files = []
 
-        async def fetch_contents(path: str):
-            """Recursively fetch directory contents using contents API."""
-            url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{path}?ref={commit_id}"
-            try:
-                async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT) as client:
-                    resp = await client.get(url, headers=self.headers)
+        async with httpx.AsyncClient(timeout=_DEFAULT_TIMEOUT, headers=self.headers) as client:
+            async def fetch_contents(path: str):
+                """Recursively fetch directory contents using contents API in parallel."""
+                url = f"{self.base_url}/api/v1/repos/{owner}/{repo}/contents/{path}?ref={commit_id}"
+                try:
+                    resp = await client.get(url)
                     if resp.status_code == 404:
                         logger.warning(f"Directory not found: {path}")
                         return
                     resp.raise_for_status()
                     data = resp.json()
 
-                    # contents API returns list for directories
                     if isinstance(data, list):
+                        tasks = []
                         for item in data:
                             if item.get("type") == "file":
                                 all_files.append({
@@ -159,13 +153,14 @@ class GogsClient:
                                     "type": "blob"
                                 })
                             elif item.get("type") == "dir":
-                                # Recursively fetch subdirectory
-                                await fetch_contents(item.get("path"))
-
-            except httpx.HTTPStatusError as e:
-                logger.error(f"Failed to get contents for {path}: {e}")
-
-        await fetch_contents(dir_path)
+                                tasks.append(fetch_contents(item.get("path")))
+                        
+                        if tasks:
+                            await asyncio.gather(*tasks)
+                except Exception as e:
+                    logger.error(f"Failed to get contents for {path}: {e}")
+
+            await fetch_contents(dir_path)
         return all_files
 
     async def get_file_content(self, owner: str, repo: str, commit_id: str, file_path: str) -> bytes:

+ 139 - 2
app/services/storage_service.py

@@ -1,11 +1,12 @@
 import os
 from sqlalchemy.orm import Session
 from sqlalchemy.exc import IntegrityError
-from app.models import Project, DataVersion, DataFile
+from app.models import Project, DataVersion, DataFile, DataRecord
 from app.config import settings
 from app.services.gogs_client import GogsClient
 from app.services.oss_client import oss_client
 import logging
+import hashlib
 
 logger = logging.getLogger(__name__)
 
@@ -24,13 +25,14 @@ class StorageService:
             self.db.refresh(project)
         return project
 
-    def create_version(self, project_id: str, stage: str, commit_id: str, author: str, manifest: str) -> DataVersion | None:
+    def create_version(self, project_id: str, stage: str, commit_id: str, author: str, manifest: str, commit_message: str = None) -> DataVersion | None:
         """Create a new data version. Returns None if a duplicate exists (IntegrityError)."""
         version = DataVersion(
             project_id=project_id,
             stage=stage,
             commit_id=commit_id,
             author=author,
+            commit_message=commit_message,
             manifest_snapshot=manifest
         )
         try:
@@ -94,6 +96,65 @@ class StorageService:
 
         return False
 
+    def aggregate_version_records(self, version: DataVersion):
+        """Aggregate files in a version into DataRecord groups based on parent directory."""
+        from collections import defaultdict
+        
+        # 1. Clean existing records for this version (idempotency)
+        self.db.query(DataRecord).filter(DataRecord.version_id == version.id).delete()
+        
+        files = self.db.query(DataFile).filter(DataFile.version_id == version.id).all()
+        
+        # 2. Group by dirname
+        groups = defaultdict(lambda: {"inputs": [], "outputs": []})
+        
+        for f in files:
+            # Group key falls back to immediate parent directory if not explicitly saved in f.group_key
+            group_key = f.group_key if f.group_key is not None else os.path.dirname(f.relative_path)
+            
+            file_data = {
+                "id": f.id,
+                "relative_path": f.relative_path,
+                "file_type": f.file_type,
+                "file_size": f.file_size,
+                "file_sha": f.file_sha,
+                "direction": f.direction,
+                "label": f.label,
+                "extracted_value": f.extracted_value,
+                "storage_path": f.storage_path
+            }
+            if f.direction == "input":
+                groups[group_key]["inputs"].append(file_data)
+            else:
+                # Treat 'output' or None as output by default for rendering purposes
+                groups[group_key]["outputs"].append(file_data)
+        # 3. Insert aggregated records
+        for group_key, data in groups.items():
+            # Calculate a deterministic content_hash for this group of files
+            # Combine all SHA values, sort them to ensure same set of files results in same hash
+            all_shas = [f["file_sha"] for f in data["inputs"]] + [f["file_sha"] for f in data["outputs"]]
+            all_shas.sort()
+            combined_string = "|".join(all_shas)
+            content_hash = hashlib.sha256(combined_string.encode('utf-8')).hexdigest()
+
+            record = DataRecord(
+                project_id=version.project_id,
+                version_id=version.id,
+                stage=version.stage,
+                commit_id=version.commit_id,
+                commit_message=version.commit_message,
+                group_key=group_key,
+                inputs=data["inputs"],
+                outputs=data["outputs"],
+                content_hash=content_hash,
+                author=version.author,
+                # letting server_default handle created_at
+            )
+            self.db.add(record)
+            
+        self.db.commit()
+        logger.info(f"Aggregated version {version.id} into {len(groups)} DataRecord(s).")
+
     async def process_file_with_sha(
         self,
         version: DataVersion,
@@ -101,9 +162,14 @@ class StorageService:
         file_sha: str,
         owner: str,
         repo: str,
+        direction: str = None,
+        label: str = None,
+        extract_json_key: str = None,
+        directory_depth: int = None,
     ) -> bool:
         """Process a file and create a snapshot record.
 
+
         **Snapshot semantics**: a record is ALWAYS created regardless of
         whether the file changed.  This ensures every version is a
         self-contained snapshot of all declared output files.
@@ -127,8 +193,52 @@ class StorageService:
             .first()
         )
 
+        should_extract = bool(extract_json_key and relative_path.lower().endswith(".json"))
+        extracted_val = None
+
+        # Calculate group_key based on directory_depth
+        calc_group_key = os.path.dirname(relative_path)  # Default fallback
+        if directory_depth is not None and directory_depth > 0:
+            parts = relative_path.split("/")
+            # Remove filename
+            if len(parts) > 1:
+                parts = parts[:-1]
+                # Combine up to directory_depth
+                calc_group_key = "/".join(parts[:directory_depth])
+            else:
+                calc_group_key = "" # File is in root directory
+
+        async def _extract_val() -> str | None:
+            try:
+                content_bytes = await self.gogs.get_file_content(owner, repo, version.commit_id, relative_path)
+                if not content_bytes:
+                    return None
+                import json
+                parsed = json.loads(content_bytes.decode('utf-8'))
+                val = parsed
+                for key_part in extract_json_key.split("."):
+                    if isinstance(val, dict):
+                        val = val.get(key_part)
+                    else:
+                        val = None
+                        break
+                if val is not None:
+                    if isinstance(val, (dict, list)):
+                        return json.dumps(val, ensure_ascii=False)
+                    return str(val)
+            except Exception as e:
+                logger.warning(f"Failed to extract json key {extract_json_key} from {relative_path}: {e}")
+            return None
+
         if last_file and last_file.file_sha == file_sha:
             # ── Unchanged: reuse previous OSS key, still record a snapshot entry ──
+            # Optimization: Try to reuse previously extracted value if the SHA hasn't changed
+            if should_extract:
+                if last_file.extracted_value is not None:
+                    extracted_val = last_file.extracted_value
+                else:
+                    extracted_val = await _extract_val()
+            
             new_file = DataFile(
                 version_id=version.id,
                 relative_path=relative_path,
@@ -136,6 +246,10 @@ class StorageService:
                 file_size=last_file.file_size,
                 file_type=last_file.file_type,
                 file_sha=file_sha,
+                direction=direction,
+                label=label,
+                extracted_value=extracted_val,
+                group_key=calc_group_key,
             )
             self.db.add(new_file)
             self.db.commit()
@@ -157,6 +271,25 @@ class StorageService:
 
         oss_client.upload(oss_key, content)
 
+        if should_extract:
+            try:
+                import json
+                parsed = json.loads(content.decode('utf-8'))
+                val = parsed
+                for key_part in extract_json_key.split("."):
+                    if isinstance(val, dict):
+                        val = val.get(key_part)
+                    else:
+                        val = None
+                        break
+                if val is not None:
+                    if isinstance(val, (dict, list)):
+                        extracted_val = json.dumps(val, ensure_ascii=False)
+                    else:
+                        extracted_val = str(val)
+            except Exception as e:
+                logger.warning(f"Failed to extract json key {extract_json_key} from {relative_path}: {e}")
+
         new_file = DataFile(
             version_id=version.id,
             relative_path=relative_path,
@@ -164,6 +297,10 @@ class StorageService:
             file_size=file_size,
             file_type=os.path.splitext(relative_path)[1],
             file_sha=file_sha,
+            direction=direction,
+            label=label,
+            extracted_value=extracted_val,
+            group_key=calc_group_key,
         )
         self.db.add(new_file)
         self.db.commit()

+ 62 - 16
app/services/webhook_service.py

@@ -114,9 +114,15 @@ class WebhookService:
                 logger.info(f"Stage '{stage_name}' already processed. Skipping.")
                 continue
 
+            # Get commit message from payload if available
+            commit_msg = None
+            commits = payload.get("commits", [])
+            if commits:
+                commit_msg = commits[0].get("message")
+
             # Create version for this stage
             version = self.storage.create_version(
-                project.id, stage_name, after_sha, author_name, manifest=manifest_content
+                project.id, stage_name, after_sha, author_name, manifest=manifest_content, commit_message=commit_msg
             )
 
             if not version:
@@ -138,6 +144,8 @@ class WebhookService:
                     f"Stage '{stage_name}': no data changes detected (content and file set same). "
                     f"Version discarded."
                 )
+            else:
+                self.storage.aggregate_version_records(version)
 
     def _get_all_changed_files(self, payload: dict) -> set[str]:
         """Extract all added, modified, and removed files from all commits in payload."""
@@ -162,8 +170,13 @@ class WebhookService:
                 if is_dir:
                     # If it's a directory output, any change inside that directory counts
                     dir_path = path_pattern.rstrip("/")
-                    if f == dir_path or f.startswith(dir_path + "/"):
-                        return True
+                    if '*' in dir_path:
+                        import fnmatch
+                        if fnmatch.fnmatch(f, dir_path + "/*") or fnmatch.fnmatch(f, dir_path):
+                            return True
+                    else:
+                        if f == dir_path or f.startswith(dir_path + "/"):
+                            return True
                 else:
                     # Single file output: exact match
                     if f == path_pattern:
@@ -189,30 +202,61 @@ class WebhookService:
             patterns = output.get("pattern", "*")
             excludes = output.get("exclude")
 
+            direction = output.get("direction")
+            label = output.get("label")
+            extract_json_key = output.get("extract_json_key")
+            directory_depth = output.get("directory_depth")
+
             path_pattern = normalize_path(raw_path_pattern)
             is_dir = is_directory_pattern(raw_path_pattern)
+            dir_path = path_pattern.rstrip("/")
 
             if is_dir:
-                # Directory pattern: fetch only this directory's files
-                dir_path = path_pattern.rstrip("/")
-                logger.info(f"Fetching directory: {dir_path} with patterns: {patterns}, excludes: {excludes}")
+                # Directory pattern: fetch files from the closest static parent directory
+                # For `data/*/test/`, that is `data/`
+                import re
+                
+                # Split by first wildcard chunk path
+                wildcard_idx = dir_path.find('*')
+                if wildcard_idx != -1:
+                    static_base = dir_path[:wildcard_idx]
+                    # Trim back to the nearest directory separator
+                    last_sep = static_base.rfind('/')
+                    if last_sep != -1:
+                        static_base = static_base[:last_sep]
+                    else:
+                        static_base = "" # ROOT
+                else:
+                    static_base = dir_path
+                    
+                static_base = static_base.strip('/')
+                
+                logger.info(f"Fetching directory: {static_base} (to match wildcard path: {dir_path}) with patterns: {patterns}, excludes: {excludes}")
 
-                files = await self.gogs.get_directory_tree(owner, repo_name, commit_id, dir_path)
+                files = await self.gogs.get_directory_tree(owner, repo_name, commit_id, static_base)
 
                 for file_info in files:
                     file_path = file_info.get("path")
-                    # Calculate name relative to the watched directory
-                    # e.g. dir_path="a", file_path="a/b.txt" -> rel_name="b.txt"
-                    rel_name = (
-                        file_path[len(dir_path) + 1 :]
-                        if file_path.startswith(dir_path + "/")
-                        else file_path
-                    )
+                    
+                    # 1. First verify if the full path matches the wildcard directory path provided
+                    if '*' in dir_path:
+                        # e.g dir_path: data/*/test/ -> match: data/*/test/*
+                        if not fnmatch.fnmatch(file_path, dir_path + "/*") and not fnmatch.fnmatch(file_path, dir_path):
+                            continue
+                    else:
+                        if not file_path.startswith(dir_path + "/"):
+                            continue
+                        
+                    # Calculate name relative to the matched base path segment for pattern matching
+                    import os
+                    rel_name = os.path.basename(file_path)
 
                     if self._match_patterns(rel_name, patterns, excludes):
                         try:
                             changed = await self.storage.process_file_with_sha(
-                                version, file_path, file_info.get("sha"), owner, repo_name
+                                version, file_path, file_info.get("sha"), owner, repo_name,
+                                direction=direction, label=label, extract_json_key=extract_json_key,
+                                directory_depth=directory_depth
                             )
                             if changed:
                                 has_changes = True
@@ -230,7 +274,9 @@ class WebhookService:
                     if self._match_patterns(filename, patterns, excludes):
                         try:
                             changed = await self.storage.process_file_with_sha(
-                                version, path_pattern, file_info.get("sha"), owner, repo_name
+                                version, path_pattern, file_info.get("sha"), owner, repo_name,
+                                direction=direction, label=label, extract_json_key=extract_json_key,
+                                directory_depth=directory_depth
                             )
                             if changed:
                                 has_changes = True

+ 82 - 57
app/static/console.html

@@ -580,7 +580,6 @@
 
         .fg-children {
             display: none;
-            background: rgba(0, 0, 0, 0.1);
         }
 
         .fg-children.open {
@@ -826,14 +825,14 @@
             let h = '';
 
             S.versions.forEach((v, i) => {
-                const groups = groupFiles(v.files);
+                const tree = buildFileTree(v.files);
                 h += `<div class="version-card" style="animation-delay:${Math.min(i, 10) * 0.05}s">
             <div class="version-head">
                 <span class="commit-tag">${IC.commit} ${esc(v.commit_id.substring(0, 8))}</span>
                 <span class="v-author">${v.author ? esc(v.author) : ''}</span>
                 <span class="v-time" title="${fmtTime(v.created_at)}">${relTime(v.created_at)}</span>
             </div>
-            <div class="version-files">${renderGroups(groups, v)}</div>
+            <div class="version-files">${renderTree(tree, v, 0)}</div>
         </div>`;
             });
             if (S.hasMore) {
@@ -849,82 +848,102 @@
         }
 
         // ============ File Grouping ============
-        function groupFiles(files) {
-            if (!files || !files.length) return [];
+        function countFiles(node) {
+            let cnt = node.files.length;
+            Object.values(node.dirs).forEach(d => { cnt += countFiles(d); });
+            return cnt;
+        }
 
-            const topLevelGroups = {};
-            const rootFiles = [];
+        function buildFileTree(files) {
+            const root = { dirs: {}, files: [], path: '' };
+            if (!files || !files.length) return root;
 
             files.forEach(f => {
                 const parts = f.relative_path.split('/');
-                if (parts.length === 1) {
-                    rootFiles.push(f);
-                } else {
-                    const topDir = parts[0];
-                    if (!topLevelGroups[topDir]) topLevelGroups[topDir] = [];
-                    topLevelGroups[topDir].push(f);
-                }
-            });
-
-            const result = [];
-            Object.entries(topLevelGroups).forEach(([topDir, fls]) => {
-                let commonParts = fls[0].relative_path.split('/').slice(0, -1);
-                for (let i = 1; i < fls.length; i++) {
-                    const parts = fls[i].relative_path.split('/').slice(0, -1);
-                    let j = 0;
-                    while (j < commonParts.length && j < parts.length && commonParts[j] === parts[j]) {
-                        j++;
+                let cur = root;
+                for (let i = 0; i < parts.length - 1; i++) {
+                    const p = parts[i];
+                    if (!cur.dirs[p]) {
+                        const curPath = cur.path ? cur.path + '/' + p : p;
+                        cur.dirs[p] = { name: p, path: curPath, dirs: {}, files: [] };
                     }
-                    commonParts.length = j;
+                    cur = cur.dirs[p];
                 }
-                const groupName = commonParts.join('/');
-                result.push({ type: 'folder', name: groupName, path: groupName, files: fls });
+                cur.files.push(f);
             });
 
-            rootFiles.forEach(f => {
-                result.push({ type: 'file', file: f });
-            });
+            function compact(node) {
+                const dirKeys = Object.keys(node.dirs);
+                dirKeys.forEach(k => {
+                    compact(node.dirs[k]);
+                });
+
+                Object.keys(node.dirs).forEach(k => {
+                    let child = node.dirs[k];
+                    if (!child) return;
+
+                    let changed = true;
+                    while (changed) {
+                        changed = false;
+
+                        if (Object.keys(child.dirs).length === 1 && child.files.length === 0) {
+                            const onlyChildKey = Object.keys(child.dirs)[0];
+                            const onlyChild = child.dirs[onlyChildKey];
+
+                            child.name = child.name + '/' + onlyChild.name;
+                            child.path = onlyChild.path;
+                            child.dirs = onlyChild.dirs;
+                            child.files = onlyChild.files;
+                            changed = true;
+                        }
+
+                        if (Object.keys(child.dirs).length === 0 && child.files.length === 1) {
+                            node.files.push(child.files[0]);
+                            delete node.dirs[k];
+                            // since child is deleted, break inner loops
+                            changed = false;
+                        }
+                    }
+                });
+            }
+            compact(root);
 
-            result.sort((a, b) => {
-                if (a.type !== b.type) return a.type === 'folder' ? -1 : 1;
-                return (a.name || a.file.name).localeCompare(b.name || b.file.name);
-            });
-            return result;
+            return root;
         }
 
-        function renderGroups(groups, version) {
-            if (!groups.length) return '';
+        function renderTree(node, version, depth) {
             let h = '';
-            groups.forEach(g => {
-                if (g.type === 'folder') {
-                    const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
-                    h += `
-            <div class="fg-header" onclick="toggleFG('${gid}')">
+
+            const dirKeys = Object.keys(node.dirs).sort((a, b) => a.localeCompare(b));
+            dirKeys.forEach(k => {
+                const d = node.dirs[k];
+                const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
+                const fileCount = countFiles(d);
+                const padding = `padding-left: ${20 + depth * 24}px;`;
+
+                h += `
+            <div class="fg-header" style="${padding}" onclick="toggleFG('${gid}')">
                 <div class="fg-name-wrap">
                     <span class="fg-arrow" id="fa_${gid}">${IC.chevron}</span>
                     <span class="fg-icon">${IC.folder}</span>
-                    <span class="fg-name">${esc(g.name)}/</span>
-                    <span class="fg-count">${g.files.length} 个文件</span>
+                    <span class="fg-name" title="${esc(d.path)}">${esc(d.name)}/</span>
+                    <span class="fg-count">${fileCount} 个文件</span>
                 </div>
                 <div></div>
             </div>
             <div class="fg-children" id="${gid}">
-                ${g.files.map(f => fileRow(f, version, true, g.path)).join('')}
+                ${renderTree(d, version, depth + 1)}
             </div>`;
-                } else {
-                    h += fileRow(g.file, version, false, null);
-                }
             });
-            return h;
-        }
 
-        function fileRow(f, version, isChild, groupPath) {
-            const padding = isChild ? 'padding-left: 44px;' : '';
-            let displayName = f.name;
-            if (groupPath && f.relative_path.startsWith(groupPath + '/')) {
-                displayName = f.relative_path.substring(groupPath.length + 1);
-            }
-            return `
+            node.files.sort((a, b) => a.relative_path.localeCompare(b.relative_path)).forEach(f => {
+                let displayName = f.relative_path || f.name;
+                if (node.path && f.relative_path.startsWith(node.path + '/')) {
+                    displayName = f.relative_path.substring(node.path.length + 1);
+                }
+                const padding = `padding-left: ${depth === 0 ? 20 : 44 + (depth - 1) * 24}px;`;
+
+                h += `
     <div class="file-row" style="${padding}">
         <div class="file-name-col" title="${esc(f.relative_path)}">
             <span class="f-icon">${IC.file}</span>
@@ -935,6 +954,12 @@
             <a class="btn-dl" href="/files/${f.id}/content" download="${esc(f.name)}" onclick="event.stopPropagation();">${IC.download}</a>
         </div>
     </div>`;
+            });
+
+            if (depth === 0 && h === '') {
+                return '<div style="padding:14px 20px;font-size:13px;color:var(--text-muted)">暂无文件</div>';
+            }
+            return h;
         }
 
         function toggleFG(id) {

+ 730 - 0
app/static/records.html

@@ -0,0 +1,730 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>中后台系统</title>
+    <meta name="description" content="Data Nexus 宽表数据视图控制台">
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link
+        href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap"
+        rel="stylesheet">
+    <style>
+        *,
+        *::before,
+        *::after {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+
+        :root {
+            --bg-base: #f4f7f9;
+            --bg-sidebar: #ffffff;
+            --bg-nav: #ffffff;
+            --bg-card: #ffffff;
+            --bg-hover: #f8fafc;
+            --bg-active: #eef6ff;
+            --border: #e2e8f0;
+            --border-dark: #d1d5db;
+            --text-primary: #1e293b;
+            --text-secondary: #475569;
+            --text-muted: #94a3b8;
+            --accent: #2a8bf2;
+            --radius: 4px;
+            --sidebar-w: 240px;
+            --nav-h: 60px;
+        }
+
+        body {
+            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'PingFang SC', 'Microsoft YaHei', sans-serif;
+            background: var(--bg-base);
+            color: var(--text-primary);
+            height: 100vh;
+            overflow: hidden;
+            line-height: 1.5;
+        }
+
+        /* --- Global Layout --- */
+        .app {
+            display: flex;
+            flex-direction: column;
+            height: 100vh;
+        }
+
+        /* --- Top Navigation --- */
+        .navbar {
+            height: var(--nav-h);
+            background: var(--bg-nav);
+            border-bottom: 2px solid #edeff2;
+            display: flex;
+            align-items: center;
+            padding: 0 40px;
+            z-index: 100;
+            flex-shrink: 0;
+        }
+
+        .nav-logo {
+            font-size: 20px;
+            font-weight: 700;
+            color: #2c3e50;
+            margin-right: 60px;
+            white-space: nowrap;
+            letter-spacing: 0.5px;
+        }
+
+        .nav-menu {
+            display: flex;
+            height: 100%;
+            align-items: center;
+        }
+
+        .nav-item {
+            font-size: 15px;
+            color: var(--accent);
+            font-weight: 600;
+            height: 100%;
+            display: flex;
+            align-items: center;
+            padding: 0 4px;
+            position: relative;
+        }
+
+        .nav-item::after {
+            content: '';
+            position: absolute;
+            bottom: -2px;
+            left: -4px;
+            right: -4px;
+            height: 3px;
+            background: var(--accent);
+        }
+
+        /* --- Main Content Layout --- */
+        .main-container {
+            flex: 1;
+            display: grid;
+            grid-template-columns: var(--sidebar-w) 1fr;
+            overflow: hidden;
+        }
+
+        /* --- Sidebar --- */
+        .sidebar {
+            background: var(--bg-sidebar);
+            border-right: 1px solid var(--border);
+            overflow-y: auto;
+            padding: 12px;
+        }
+
+        .sidebar-group {
+            margin-bottom: 8px;
+            border: 1px solid #edeff2;
+            border-radius: 4px;
+            overflow: hidden;
+        }
+
+        .sidebar-tag {
+            font-size: 20px;
+            font-weight: 700;
+            padding: 8px 16px;
+            text-transform: lowercase;
+            width: 100%;
+            border-bottom: 1px solid #edeff2;
+        }
+
+        .tag-how {
+            background: #dcfce7;
+            color: #166534;
+        }
+
+        .tag-what {
+            background: #fef3c7;
+            color: #92400e;
+        }
+
+        .sidebar-list {
+            list-style: none;
+        }
+
+        .sidebar-item {
+            padding: 10px 16px;
+            font-size: 14px;
+            color: #4b5563;
+            cursor: pointer;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            transition: all 0.2s;
+            border-bottom: 1px solid #f1f5f9;
+        }
+
+        .sidebar-item:last-child {
+            border-bottom: none;
+        }
+
+        .sidebar-item:hover {
+            background: var(--bg-hover);
+            color: var(--text-primary);
+        }
+
+        .sidebar-item.active {
+            background: #ffffff;
+            color: var(--accent);
+            font-weight: 600;
+            box-shadow: inset 4px 0 0 var(--accent);
+        }
+
+        /* --- Content Area --- */
+        .content {
+            display: flex;
+            flex-direction: column;
+            overflow: hidden;
+        }
+
+        .table-container {
+            flex: 1;
+            overflow: auto;
+            padding: 24px;
+        }
+
+        .records-table {
+            width: 100%;
+            border-collapse: collapse;
+            background: #ffffff;
+            box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+            min-width: 1000px;
+        }
+
+        .records-table thead th {
+            background: #eff2f5;
+            color: #1e293b;
+            font-size: 14px;
+            font-weight: 600;
+            text-align: left;
+            padding: 12px 16px;
+            border: 1px solid var(--border-dark);
+        }
+
+        .records-table td {
+            padding: 12px 16px;
+            border: 1px solid var(--border-dark);
+            vertical-align: middle;
+            font-size: 14px;
+            color: #334155;
+        }
+
+        .records-table tr:hover td {
+            background: #fcfdfe;
+        }
+
+        .id-col {
+            width: 120px;
+            font-family: monospace;
+            color: var(--text-muted);
+        }
+
+        .msg-col {
+            width: 300px;
+            font-weight: 500;
+        }
+
+        /* Bubble Tree Hierarchical Styles */
+        .bubble-tree {
+            background: #ffffff;
+            border: 1px solid var(--border-dark);
+            border-radius: 4px;
+            padding: 4px;
+            min-height: 48px;
+            min-width: 200px;
+        }
+
+        .fg-header {
+            display: flex;
+            align-items: center;
+            gap: 4px;
+            padding: 4px 8px;
+            cursor: pointer;
+            transition: background 0.1s;
+            user-select: none;
+            border-radius: 4px;
+        }
+
+        .fg-header:hover {
+            background: var(--bg-hover);
+        }
+
+        .fg-name-wrap {
+            display: flex;
+            align-items: center;
+            gap: 6px;
+            min-width: 0;
+            flex: 1;
+        }
+
+        .fg-arrow {
+            width: 14px;
+            height: 14px;
+            color: var(--text-muted);
+            transition: transform 0.2s;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+        }
+
+        .fg-arrow.open {
+            transform: rotate(90deg);
+        }
+
+        .fg-icon {
+            width: 16px;
+            height: 16px;
+            color: #f6ad55;
+            display: flex;
+            align-items: center;
+        }
+
+        .fg-name {
+            font-size: 13px;
+            color: var(--text-primary);
+            font-weight: 500;
+            white-space: nowrap;
+            overflow: hidden;
+            text-overflow: ellipsis;
+        }
+
+        .fg-count {
+            font-size: 11px;
+            color: var(--text-muted);
+            background: #f1f5f9;
+            padding: 1px 5px;
+            border-radius: 3px;
+        }
+
+        .fg-children {
+            display: none;
+        }
+
+        .fg-children.open {
+            display: block;
+        }
+
+        .file-row {
+            padding: 4px 8px;
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            transition: background 0.1s;
+            border-radius: 4px;
+        }
+
+        .file-row:hover {
+            background: var(--bg-hover);
+        }
+
+        .f-icon {
+            width: 16px;
+            height: 16px;
+            color: var(--text-muted);
+            display: flex;
+            align-items: center;
+        }
+
+        .f-info {
+            flex: 1;
+            min-width: 0;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .f-name-line {
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }
+
+        .f-name {
+            font-size: 13px;
+            color: var(--text-primary);
+            white-space: nowrap;
+            overflow: hidden;
+            text-overflow: ellipsis;
+        }
+
+        .f-extracted {
+            font-size: 11px;
+            color: #3b82f6;
+            word-break: break-all;
+            margin-top: 1px;
+        }
+
+        .f-size {
+            font-size: 11px;
+            color: var(--text-muted);
+            margin-left: auto;
+            padding-left: 8px;
+        }
+
+        .btn-dl {
+            color: var(--accent);
+            text-decoration: none;
+            display: flex;
+            align-items: center;
+            opacity: 0.6;
+            transition: all 0.2s;
+            margin-left: 4px;
+        }
+
+        .btn-dl:hover {
+            opacity: 1;
+        }
+
+        .btn-dl svg {
+            width: 14px;
+            height: 14px;
+        }
+
+        .col-badge {
+            display: inline-block;
+            padding: 1px 6px;
+            border-radius: 3px;
+            font-size: 10px;
+            font-weight: 700;
+            margin-bottom: 4px;
+            text-transform: uppercase;
+            width: fit-content;
+        }
+
+        .badge-in {
+            background: #eff6ff;
+            color: #3b82f6;
+            border: 1px solid #dbeafe;
+        }
+
+        .badge-out {
+            background: #f0fdf4;
+            color: #22c55e;
+            border: 1px solid #dcfce7;
+        }
+
+        .th-cell {
+            display: flex;
+            flex-direction: column;
+            justify-content: flex-start;
+        }
+
+        .th-label {
+            font-size: 13px;
+        }
+
+        .state-box {
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100%;
+            color: var(--text-muted);
+        }
+
+        .spinner {
+            width: 32px;
+            height: 32px;
+            border: 3px solid #e2e8f0;
+            border-top-color: var(--accent);
+            border-radius: 50%;
+            animation: spin 0.8s linear infinite;
+            margin-bottom: 16px;
+        }
+
+        @keyframes spin {
+            to {
+                transform: rotate(360deg);
+            }
+        }
+
+        ::-webkit-scrollbar {
+            width: 6px;
+            height: 6px;
+        }
+
+        ::-webkit-scrollbar-track {
+            background: transparent;
+        }
+
+        ::-webkit-scrollbar-thumb {
+            background: #cbd5e1;
+            border-radius: 3px;
+        }
+    </style>
+</head>
+
+<body>
+    <div class="app">
+        <header class="navbar">
+            <div class="nav-logo">中后台系统</div>
+            <nav class="nav-menu">
+                <div class="nav-item">解构</div>
+            </nav>
+        </header>
+
+        <div class="main-container">
+            <aside class="sidebar">
+                <div id="sidebarContent">
+                    <div class="sidebar-group">
+                        <div class="sidebar-tag tag-how">how</div>
+                        <ul class="sidebar-list" id="howList"></ul>
+                    </div>
+                    <div class="sidebar-group">
+                        <div class="sidebar-tag tag-what">what</div>
+                        <ul class="sidebar-list" id="whatList"></ul>
+                    </div>
+                </div>
+            </aside>
+
+            <main class="content">
+                <div class="table-container" id="contentBody">
+                    <div class="state-box">
+                        <div class="spinner"></div>
+                        <p>加载中...</p>
+                    </div>
+                </div>
+            </main>
+        </div>
+    </div>
+
+    <script>
+        const IC = {
+            file: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M13 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"></path><polyline points="13 2 13 9 20 9"></polyline></svg>',
+            folder: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M22 19a2 2 0 0 1-2 2H4a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h5l2 3h9a2 2 0 0 1 2 2z"></path></svg>',
+            download: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"></path><polyline points="7 10 12 15 17 10"></polyline><line x1="12" y1="15" x2="12" y2="3"></line></svg>',
+            chevron: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><polyline points="9 18 15 12 9 6"></polyline></svg>',
+        };
+
+        const PAGE_SIZE = 20;
+        let S = { stages: [], stageProjectMap: {}, stage: null, records: [], skip: 0, hasMore: true, loading: false };
+
+        const $ = id => document.getElementById(id);
+        function esc(s) { if (!s) return ''; const d = document.createElement('div'); d.textContent = s; return d.innerHTML; }
+
+        function fmtSize(b) {
+            if (!b && b !== 0) return '';
+            const u = ['B', 'KB', 'MB', 'GB']; let i = 0, s = b;
+            while (s >= 1024 && i < u.length - 1) { s /= 1024; i++; }
+            return s.toFixed(i > 0 ? 1 : 0) + ' ' + u[i];
+        }
+
+        async function api(url) {
+            try {
+                const r = await fetch(url);
+                if (!r.ok) {
+                    const text = await r.text();
+                    throw new Error(`HTTP ${r.status}: ${text || r.statusText}`);
+                }
+                return r.json();
+            } catch (e) {
+                console.error('API Error:', e);
+                throw e;
+            }
+        }
+
+        async function init() {
+            try {
+                S.stages = await api('/stages/all');
+                S.stages.forEach(st => { S.stageProjectMap[st.name] = st.project_id; });
+                renderSidebar();
+                const first = document.querySelector('.sidebar-item');
+                if (first) first.click();
+            } catch (e) {
+                $('contentBody').innerHTML = `<div class="state-box"><p style="color:red">初始化失败: ${esc(e.message)}</p></div>`;
+            }
+        }
+
+        function renderSidebar() {
+            const howL = $('howList'), whatL = $('whatList');
+            howL.innerHTML = ''; whatL.innerHTML = '';
+            S.stages.forEach(st => {
+                const li = document.createElement('li');
+                li.className = 'sidebar-item';
+                li.innerHTML = `${esc(st.name.split('/').pop())}`;
+                li.onclick = () => selectStage(li, st.name);
+                (st.name.toLowerCase().includes('how') || st.name.toLowerCase().includes('test') ? howL : whatL).appendChild(li);
+            });
+        }
+
+        async function selectStage(el, stageName) {
+            document.querySelectorAll('.sidebar-item').forEach(i => i.classList.remove('active'));
+            el.classList.add('active');
+            S.stage = stageName; S.records = []; S.skip = 0;
+            loadRecords();
+        }
+
+        async function loadRecords() {
+            if (S.loading) return;
+            S.loading = true;
+            $('contentBody').innerHTML = '<div class="state-box"><div class="spinner"></div><p>读取流水线数据...</p></div>';
+            try {
+                const pid = S.stageProjectMap[S.stage];
+                if (!pid) throw new Error('项目 ID 不存在');
+
+                const url = `/projects/${pid}/records?stage=${encodeURIComponent(S.stage)}&skip=0&limit=${PAGE_SIZE}`;
+                const data = await api(url);
+                S.records = Array.isArray(data) ? data : [];
+                renderTable();
+            } catch (e) {
+                console.error('Load Records Error:', e);
+                $('contentBody').innerHTML = `<div class="state-box"><p style="color:red">加载失败: ${esc(e.message)}</p></div>`;
+            }
+            S.loading = false;
+        }
+
+        function renderTable() {
+            if (!S.records || !S.records.length) {
+                $('contentBody').innerHTML = '<div class="state-box"><h2>暂无记录</h2></div>';
+                return;
+            }
+
+            const inLabels = new Set(), outLabels = new Set();
+            S.records.forEach(r => {
+                if (r.inputs) r.inputs.forEach(f => inLabels.add(f.label || '输入'));
+                if (r.outputs) r.outputs.forEach(f => outLabels.add(f.label || '输出'));
+            });
+
+            const sortedIn = Array.from(inLabels).sort();
+            const sortedOut = Array.from(outLabels).sort();
+
+            let h = `<table class="records-table">
+                <thead>
+                    <tr>
+                        <th class="id-col">commit id</th>
+                        <th class="msg-col">commit message</th>`;
+
+            sortedIn.forEach(l => h += `<th><div class="th-cell"><span class="col-badge badge-in">输入</span><span class="th-label">${esc(l)}</span></div></th>`);
+            sortedOut.forEach(l => h += `<th><div class="th-cell"><span class="col-badge badge-out">输出</span><span class="th-label">${esc(l)}</span></div></th>`);
+            h += `</tr></thead><tbody>`;
+
+            S.records.forEach(r => {
+                h += `<tr>
+                    <td class="id-col">${esc(r.commit_id.substring(0, 8))}</td>
+                    <td class="msg-col">${esc(r.commit_message || '无描述')}</td>`;
+
+                sortedIn.forEach(l => {
+                    const files = (r.inputs || []).filter(f => (f.label || '输入') === l);
+                    h += `<td>${renderFiles(files)}</td>`;
+                });
+                sortedOut.forEach(l => {
+                    const files = (r.outputs || []).filter(f => (f.label || '输出') === l);
+                    h += `<td>${renderFiles(files)}</td>`;
+                });
+                h += `</tr>`;
+            });
+
+            h += `</tbody></table>`;
+            $('contentBody').innerHTML = h;
+        }
+
+        function buildFileTree(files) {
+            const root = { dirs: {}, files: [], path: '' };
+            if (!files || !files.length) return root;
+
+            files.forEach(f => {
+                const parts = (f.relative_path || '').split('/');
+                let cur = root;
+                for (let i = 0; i < parts.length - 1; i++) {
+                    const p = parts[i];
+                    if (!cur.dirs[p]) {
+                        const curPath = cur.path ? cur.path + '/' + p : p;
+                        cur.dirs[p] = { name: p, path: curPath, dirs: {}, files: [] };
+                    }
+                    cur = cur.dirs[p];
+                }
+                cur.files.push(f);
+            });
+
+            function compact(node) {
+                Object.keys(node.dirs).forEach(k => compact(node.dirs[k]));
+                Object.keys(node.dirs).forEach(k => {
+                    let child = node.dirs[k];
+                    if (!child) return;
+                    let changed = true;
+                    while (changed) {
+                        changed = false;
+                        if (Object.keys(child.dirs).length === 1 && child.files.length === 0) {
+                            const onlyChildKey = Object.keys(child.dirs)[0];
+                            const onlyChild = child.dirs[onlyChildKey];
+                            child.name = child.name + '/' + onlyChild.name;
+                            child.path = onlyChild.path;
+                            child.dirs = onlyChild.dirs;
+                            child.files = onlyChild.files;
+                            changed = true;
+                        }
+                    }
+                });
+            }
+            compact(root);
+            return root;
+        }
+
+        function countFiles(node) {
+            let cnt = node.files.length;
+            Object.values(node.dirs).forEach(d => { cnt += countFiles(d); });
+            return cnt;
+        }
+
        // Recursively render a buildFileTree() node as HTML: a clickable
        // folder header + hidden children container per directory, then the
        // node's own files as rows. `depth` drives the left padding.
        function renderTree(node, depth) {
            let h = '';
            const dirKeys = Object.keys(node.dirs).sort((a, b) => a.localeCompare(b));
            dirKeys.forEach(k => {
                const d = node.dirs[k];
                // NOTE(review): 6 random base-36 chars as the DOM id — collisions
                // are unlikely but possible across many cells; a counter would be safer.
                const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
                const fileCount = countFiles(d);
                const padding = `padding-left: ${depth * 16}px;`;
                h += `
                    <div class="fg-header" style="${padding}" onclick="toggleFG('${gid}')">
                        <div class="fg-name-wrap">
                            <span class="fg-arrow" id="fa_${gid}">${IC.chevron}</span>
                            <span class="fg-icon">${IC.folder}</span>
                            <span class="fg-name" title="${esc(d.path)}">${esc(d.name)}/</span>
                            <span class="fg-count">${fileCount}</span>
                        </div>
                    </div>
                    <div class="fg-children" id="${gid}">
                        ${renderTree(d, depth + 1)}
                    </div>`;
            });

            // Sorts node.files IN PLACE (mutates the tree) before rendering each file row.
            node.files.sort((a, b) => (a.relative_path || '').localeCompare(b.relative_path || '')).forEach(f => {
                const name = f.relative_path ? f.relative_path.split('/').pop() : '未知文件';
                // Root-level files sit at 8px; nested files align under their folder header.
                const padding = `padding-left: ${(depth === 0 ? 8 : 24 + (depth - 1) * 16)}px;`;
                h += `
                    <div class="file-row" style="${padding}">
                        <span class="f-icon">${IC.file}</span>
                        <div class="f-info">
                            <div class="f-name-line">
                                <span class="f-name" title="${esc(f.relative_path)}">${esc(name)}</span>
                                <span class="f-size">${fmtSize(f.file_size)}</span>
                                <a class="btn-dl" href="/files/${f.id}/content" download title="下载">${IC.download}</a>
                            </div>
                            ${f.extracted_value ? `<div class="f-extracted">↳ ${esc(f.extracted_value)}</div>` : ''}
                        </div>
                    </div>`;
            });
            return h;
        }
+
+        function toggleFG(id) {
+            const ch = $(id), ar = $('fa_' + id);
+            if (ch) ch.classList.toggle('open');
+            if (ar) ar.classList.toggle('open');
+        }
+
+        function renderFiles(files) {
+            if (!files || !files.length) return '-';
+            const tree = buildFileTree(files);
+            return `<div class="bubble-tree">${renderTree(tree, 0)}</div>`;
+        }
+
+        init();
+    </script>
+</body>
+
+</html>

+ 21 - 0
manifest.yaml.example

@@ -40,6 +40,23 @@ stages:
       # 示例 D:也可以指定单个文件
       - path: final_report.docx
 
+  # ---------- 阶段 4:带有元数据与 JSON 值提取的数据 ----------
+  - name: enhanced_data
+    outputs:
+      # 示例 E:指定文件为“输入”,并打上标签
+      - path: data/input/article.md
+        direction: input        # 指定是 input 还是 output
+        label: 帖子输入         # 指定该数据的业务名称(标签)
+
+      # 示例 F:指定文件为“输出”,并且如果是 JSON 文件,可以提取特定 key 的值
+      #          同时指定深度 directory_depth: 2,这样如果文件在 `data/output/foo/bar.json`
+      #          它的 group_key 会被设置成 `data/output` 而不是默认的 `data/output/foo`
+      - path: data/output/
+        pattern: "*.json"
+        direction: output
+        label: 灵感点
+        extract_json_key: "data.idea_content"  # 会解析 JSON 并提取对应 key 的值保存
+        directory_depth: 2
 
 # ============================================================
 # 字段说明
@@ -59,6 +76,10 @@ stages:
 #     - exclude (可选) 文件排除规则,支持通配符或列表
 #                      示例: "*.tmp"
 #                      示例: ["*.log", ".DS_Store"]
+#     - direction (可选) 该文件的流入/流出方向(如 'input', 'output' 等)
+#     - label     (可选) 该文件的业务称呼/标签(如 '帖子输入', '灵感点' 等)
+#     - extract_json_key (可选) 针对 JSON 文件,配置要提取解析的 json key 路径(支持以 . 分隔的嵌套 key 路径,例如 'data.content')。提取的值会被记录在数据库中。
+#     - directory_depth  (可选) 定义这组规则生成的文件关联用的父目录深度(如 1 或 2,用来将不同子目录的关联文件合并到一行展示)。
 #
 # ============================================================
 # 工作流程

+ 44 - 0
migrate_raw.py

@@ -0,0 +1,44 @@
+import pymysql
+import os
+
def migrate():
    """Apply idempotent schema tweaks to the data_nexus MySQL database.

    Adds ``commit_message`` to ``data_records`` and ``data_versions``, and
    ``content_hash`` to ``data_records``.  Each ALTER is attempted
    independently: a failure (typically "duplicate column" on a re-run) is
    logged and skipped, so the script is safe to run repeatedly.
    """
    # SECURITY: the connection parameters were previously hard-coded here,
    # leaking production credentials in source control.  They now come from
    # the environment, falling back to the legacy values so existing usage
    # keeps working — rotate that password and remove the fallbacks ASAP.
    conn = None
    try:
        conn = pymysql.connect(
            host=os.environ.get('DB_HOST', 'rm-t4n8oyqunr5b4461s6o.mysql.singapore.rds.aliyuncs.com'),
            port=int(os.environ.get('DB_PORT', '3306')),
            user=os.environ.get('DB_USER', 'developer_saas'),
            password=os.environ.get('DB_PASSWORD', 'developer_saas#Aiddit'),
            db=os.environ.get('DB_NAME', 'data_nexus'),
        )
        # (table, column, DDL) triples applied best-effort, in order.
        migrations = [
            ('data_records', 'commit_message',
             "ALTER TABLE data_records ADD COLUMN commit_message TEXT DEFAULT NULL;"),
            ('data_versions', 'commit_message',
             "ALTER TABLE data_versions ADD COLUMN commit_message TEXT DEFAULT NULL;"),
            ('data_records', 'content_hash',
             "ALTER TABLE data_records ADD COLUMN content_hash VARCHAR(64) DEFAULT NULL;"),
        ]
        with conn.cursor() as cursor:
            print("Connected to DB")
            for table, column, ddl in migrations:
                try:
                    cursor.execute(ddl)
                    print(f"Added {column} to {table}")
                except Exception as e:
                    # Broad on purpose: a pre-existing column must not abort the run.
                    print(f"Skipping {table}.{column}: {e}")
        conn.commit()
        print("Migration done")
    except Exception as e:
        print(f"Global error: {e}")
    finally:
        # Close the connection even if connect/ALTER/commit failed.
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    migrate()

+ 33 - 0
migrate_schema.py

@@ -0,0 +1,33 @@
+from sqlalchemy import text
+from app.database import engine
+
def run_migrations():
    """Run idempotent ALTER TABLE migrations through the app's SQLAlchemy engine.

    Mirrors migrate_raw.py but uses ``app.database.engine``.  Each statement
    is attempted independently so a pre-existing column (the common failure
    on a re-run) is logged and skipped instead of aborting the whole run.
    """
    # (column, table, DDL) triples applied best-effort, in order.
    statements = [
        ("commit_message", "data_records",
         "ALTER TABLE data_records ADD COLUMN commit_message TEXT DEFAULT NULL;"),
        ("commit_message", "data_versions",
         "ALTER TABLE data_versions ADD COLUMN commit_message TEXT DEFAULT NULL;"),
        ("content_hash", "data_records",
         "ALTER TABLE data_records ADD COLUMN content_hash VARCHAR(64) DEFAULT NULL;"),
    ]
    with engine.connect() as conn:
        print("Starting migrations...")
        for column, table, ddl in statements:
            try:
                conn.execute(text(ddl))
                print(f"Added {column} to {table}")
            except Exception as e:
                # Broad on purpose: duplicate-column errors must not stop the run.
                print(f"Skipping {column} for {table}: {e}")
        conn.commit()
        print("Migrations complete.")


if __name__ == "__main__":
    run_migrations()