Browse Source

feat:records.html页面、支持directory_depth、label的manifest.yaml

tanjingyu 1 week ago
parent
commit
8809daff3c
8 changed files with 1127 additions and 62 deletions
  1. 32 1
      app/main.py
  2. 24 1
      app/models.py
  3. 23 0
      app/schemas.py
  4. 125 1
      app/services/storage_service.py
  5. 13 2
      app/services/webhook_service.py
  6. 82 57
      app/static/console.html
  7. 807 0
      app/static/records.html
  8. 21 0
      manifest.yaml.example

+ 32 - 1
app/main.py

@@ -56,7 +56,11 @@ def build_file_tree(files: List[DataFile]) -> list:
                     "id": f.id,
                     "size": f.file_size,
                     "file_type": f.file_type,
-                    "sha": f.file_sha
+                    "sha": f.file_sha,
+                    "direction": f.direction,
+                    "label": f.label,
+                    "extracted_value": f.extracted_value,
+                    "group_key": f.group_key
                 })
             else:
                 # It's a folder
@@ -93,6 +97,12 @@ def filesystem_page():
     return FileResponse(os.path.join(STATIC_DIR, "index.html"), media_type="text/html")
 
 
+@app.get("/records")
+def records_page():
+    """Serve the data records UI."""
+    return FileResponse(os.path.join(STATIC_DIR, "records.html"), media_type="text/html")
+
+
 @app.get("/api/health")
 def health_check():
     """Health check endpoint."""
@@ -226,11 +236,32 @@ def get_stage_files(
                 "file_size": f.file_size,
                 "file_type": f.file_type,
                 "file_sha": f.file_sha,
+                "direction": f.direction,
+                "label": f.label,
+                "extracted_value": f.extracted_value,
+                "group_key": f.group_key,
             } for f in files]
         })
     return result
 
 
+@app.get("/projects/{project_id}/records", response_model=List[schemas.DataRecordOut])
+def list_data_records(
+    project_id: str,
+    stage: Optional[str] = None,
+    skip: int = 0,
+    limit: int = 100,
+    db: Session = Depends(get_db)
+):
+    """List data records for a project, optionally filtered by stage."""
+    from app.models import DataRecord
+    query = db.query(DataRecord).filter(DataRecord.project_id == project_id)
+    if stage:
+        query = query.filter(DataRecord.stage == stage)
+    records = query.order_by(DataRecord.created_at.desc()).offset(skip).limit(limit).all()
+    return records
+
+
 # ==================== Version APIs ====================
 
 @app.get("/projects/{project_id}/versions", response_model=List[schemas.DataVersionOut])

+ 24 - 1
app/models.py

@@ -1,4 +1,4 @@
-from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, BigInteger, UniqueConstraint
+from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, BigInteger, UniqueConstraint, JSON
 from sqlalchemy.orm import relationship
 from sqlalchemy.sql import func
 from ulid import ULID
@@ -50,6 +50,29 @@ class DataFile(Base):
     file_size = Column(BigInteger)
     file_type = Column(String(20))
     file_sha = Column(String(64), index=True)  # Git Blob SHA for deduplication
+    direction = Column(String(20), nullable=True)  # e.g., 'input' or 'output'
+    label = Column(String(100), nullable=True)     # e.g., '帖子输入'
+    extracted_value = Column(Text, nullable=True)  # extracted JSON value
+    group_key = Column(String(255), nullable=True) # Used to group related inputs and outputs
     created_at = Column(DateTime(timezone=True), server_default=func.now())
 
     version = relationship("DataVersion", back_populates="files")
+
+class DataRecord(Base):
+    __tablename__ = "data_records"
+
+    id = Column(String(26), primary_key=True, default=generate_ulid)
+    project_id = Column(String(26), ForeignKey("projects.id"))
+    version_id = Column(String(26), ForeignKey("data_versions.id"))
+    stage = Column(String(200), index=True)
+    commit_id = Column(String(64))
+    group_key = Column(String(255))
+    
+    inputs = Column(JSON)
+    outputs = Column(JSON)
+    
+    author = Column(String(50))
+    created_at = Column(DateTime(timezone=True), server_default=func.now())
+
+    version = relationship("DataVersion")
+    project = relationship("Project")

+ 23 - 0
app/schemas.py

@@ -29,6 +29,10 @@ class DataFileBase(BaseModel):
     file_size: int
     file_type: str
     file_sha: str
+    direction: Optional[str] = None
+    label: Optional[str] = None
+    extracted_value: Optional[str] = None
+    group_key: Optional[str] = None
 
 
 class DataFileOut(DataFileBase):
@@ -66,3 +70,22 @@ class DataVersionWithFiles(DataVersionOut):
 
 # Keep old name for backward compatibility
 DataVersion = DataVersionOut
+
+
+class DataRecordBase(BaseModel):
+    stage: str
+    commit_id: str
+    group_key: Optional[str] = None
+    inputs: list = []
+    outputs: list = []
+    author: Optional[str] = None
+
+
+class DataRecordOut(DataRecordBase):
+    id: str
+    project_id: str
+    version_id: str
+    created_at: datetime
+
+    class Config:
+        from_attributes = True

+ 125 - 1
app/services/storage_service.py

@@ -1,7 +1,7 @@
 import os
 from sqlalchemy.orm import Session
 from sqlalchemy.exc import IntegrityError
-from app.models import Project, DataVersion, DataFile
+from app.models import Project, DataVersion, DataFile, DataRecord
 from app.config import settings
 from app.services.gogs_client import GogsClient
 from app.services.oss_client import oss_client
@@ -94,6 +94,57 @@ class StorageService:
 
         return False
 
+    def aggregate_version_records(self, version: DataVersion):
+        """Aggregate files in a version into DataRecord groups keyed by each file's group_key (falling back to its parent directory)."""
+        from collections import defaultdict
+        
+        # 1. Clean existing records for this version (idempotency)
+        self.db.query(DataRecord).filter(DataRecord.version_id == version.id).delete()
+        
+        files = self.db.query(DataFile).filter(DataFile.version_id == version.id).all()
+        
+        # 2. Group by group_key (dirname of relative_path when group_key is unset)
+        groups = defaultdict(lambda: {"inputs": [], "outputs": []})
+        
+        for f in files:
+            # Group key falls back to immediate parent directory if not explicitly saved in f.group_key
+            group_key = f.group_key if f.group_key is not None else os.path.dirname(f.relative_path)
+            
+            file_data = {
+                "id": f.id,
+                "relative_path": f.relative_path,
+                "file_type": f.file_type,
+                "file_size": f.file_size,
+                "file_sha": f.file_sha,
+                "direction": f.direction,
+                "label": f.label,
+                "extracted_value": f.extracted_value,
+                "storage_path": f.storage_path
+            }
+            if f.direction == "input":
+                groups[group_key]["inputs"].append(file_data)
+            else:
+                # Treat 'output' or None as output by default for rendering purposes
+                groups[group_key]["outputs"].append(file_data)
+                
+        # 3. Insert aggregated records
+        for group_key, data in groups.items():
+            record = DataRecord(
+                project_id=version.project_id,
+                version_id=version.id,
+                stage=version.stage,
+                commit_id=version.commit_id,
+                group_key=group_key,
+                inputs=data["inputs"],
+                outputs=data["outputs"],
+                author=version.author,
+                # letting server_default handle created_at
+            )
+            self.db.add(record)
+            
+        self.db.commit()
+        logger.info(f"Aggregated version {version.id} into {len(groups)} DataRecord(s).")
+
     async def process_file_with_sha(
         self,
         version: DataVersion,
@@ -101,9 +152,14 @@ class StorageService:
         file_sha: str,
         owner: str,
         repo: str,
+        direction: str = None,
+        label: str = None,
+        extract_json_key: str = None,
+        directory_depth: int = None,
     ) -> bool:
         """Process a file and create a snapshot record.
 
+
         **Snapshot semantics**: a record is ALWAYS created regardless of
         whether the file changed.  This ensures every version is a
         self-contained snapshot of all declared output files.
@@ -127,8 +183,49 @@ class StorageService:
             .first()
         )
 
+        should_extract = bool(extract_json_key and relative_path.lower().endswith(".json"))
+        extracted_val = None
+
+        # Calculate group_key based on directory_depth
+        calc_group_key = os.path.dirname(relative_path)  # Default fallback
+        if directory_depth is not None and directory_depth > 0:
+            parts = relative_path.split("/")
+            # Remove filename
+            if len(parts) > 1:
+                parts = parts[:-1]
+                # Combine up to directory_depth
+                calc_group_key = "/".join(parts[:directory_depth])
+            else:
+                calc_group_key = "" # File is in root directory
+
+        async def _extract_val() -> str | None:
+            try:
+                content_bytes = await self.gogs.get_file_content(owner, repo, version.commit_id, relative_path)
+                if not content_bytes:
+                    return None
+                import json
+                parsed = json.loads(content_bytes.decode('utf-8'))
+                val = parsed
+                for key_part in extract_json_key.split("."):
+                    if isinstance(val, dict):
+                        val = val.get(key_part)
+                    else:
+                        val = None
+                        break
+                if val is not None:
+                    if isinstance(val, (dict, list)):
+                        return json.dumps(val, ensure_ascii=False)
+                    return str(val)
+            except Exception as e:
+                logger.warning(f"Failed to extract json key {extract_json_key} from {relative_path}: {e}")
+            return None
+
         if last_file and last_file.file_sha == file_sha:
             # ── Unchanged: reuse previous OSS key, still record a snapshot entry ──
+            # Re-extract from the repository when a JSON key is configured; the previous snapshot's extracted_value is not reused
+            if should_extract:
+                extracted_val = await _extract_val()
+            
             new_file = DataFile(
                 version_id=version.id,
                 relative_path=relative_path,
@@ -136,6 +233,10 @@ class StorageService:
                 file_size=last_file.file_size,
                 file_type=last_file.file_type,
                 file_sha=file_sha,
+                direction=direction,
+                label=label,
+                extracted_value=extracted_val,
+                group_key=calc_group_key,
             )
             self.db.add(new_file)
             self.db.commit()
@@ -157,6 +258,25 @@ class StorageService:
 
         oss_client.upload(oss_key, content)
 
+        if should_extract:
+            try:
+                import json
+                parsed = json.loads(content.decode('utf-8'))
+                val = parsed
+                for key_part in extract_json_key.split("."):
+                    if isinstance(val, dict):
+                        val = val.get(key_part)
+                    else:
+                        val = None
+                        break
+                if val is not None:
+                    if isinstance(val, (dict, list)):
+                        extracted_val = json.dumps(val, ensure_ascii=False)
+                    else:
+                        extracted_val = str(val)
+            except Exception as e:
+                logger.warning(f"Failed to extract json key {extract_json_key} from {relative_path}: {e}")
+
         new_file = DataFile(
             version_id=version.id,
             relative_path=relative_path,
@@ -164,6 +284,10 @@ class StorageService:
             file_size=file_size,
             file_type=os.path.splitext(relative_path)[1],
             file_sha=file_sha,
+            direction=direction,
+            label=label,
+            extracted_value=extracted_val,
+            group_key=calc_group_key,
         )
         self.db.add(new_file)
         self.db.commit()

+ 13 - 2
app/services/webhook_service.py

@@ -138,6 +138,8 @@ class WebhookService:
                     f"Stage '{stage_name}': no data changes detected (content and file set same). "
                     f"Version discarded."
                 )
+            else:
+                self.storage.aggregate_version_records(version)
 
     def _get_all_changed_files(self, payload: dict) -> set[str]:
         """Extract all added, modified, and removed files from all commits in payload."""
@@ -189,6 +191,11 @@ class WebhookService:
             patterns = output.get("pattern", "*")
             excludes = output.get("exclude")
 
+            direction = output.get("direction")
+            label = output.get("label")
+            extract_json_key = output.get("extract_json_key")
+            directory_depth = output.get("directory_depth")
+
             path_pattern = normalize_path(raw_path_pattern)
             is_dir = is_directory_pattern(raw_path_pattern)
 
@@ -212,7 +219,9 @@ class WebhookService:
                     if self._match_patterns(rel_name, patterns, excludes):
                         try:
                             changed = await self.storage.process_file_with_sha(
-                                version, file_path, file_info.get("sha"), owner, repo_name
+                                version, file_path, file_info.get("sha"), owner, repo_name,
+                                direction=direction, label=label, extract_json_key=extract_json_key,
+                                directory_depth=directory_depth
                             )
                             if changed:
                                 has_changes = True
@@ -230,7 +239,9 @@ class WebhookService:
                     if self._match_patterns(filename, patterns, excludes):
                         try:
                             changed = await self.storage.process_file_with_sha(
-                                version, path_pattern, file_info.get("sha"), owner, repo_name
+                                version, path_pattern, file_info.get("sha"), owner, repo_name,
+                                direction=direction, label=label, extract_json_key=extract_json_key,
+                                directory_depth=directory_depth
                             )
                             if changed:
                                 has_changes = True

+ 82 - 57
app/static/console.html

@@ -580,7 +580,6 @@
 
         .fg-children {
             display: none;
-            background: rgba(0, 0, 0, 0.1);
         }
 
         .fg-children.open {
@@ -826,14 +825,14 @@
             let h = '';
 
             S.versions.forEach((v, i) => {
-                const groups = groupFiles(v.files);
+                const tree = buildFileTree(v.files);
                 h += `<div class="version-card" style="animation-delay:${Math.min(i, 10) * 0.05}s">
             <div class="version-head">
                 <span class="commit-tag">${IC.commit} ${esc(v.commit_id.substring(0, 8))}</span>
                 <span class="v-author">${v.author ? esc(v.author) : ''}</span>
                 <span class="v-time" title="${fmtTime(v.created_at)}">${relTime(v.created_at)}</span>
             </div>
-            <div class="version-files">${renderGroups(groups, v)}</div>
+            <div class="version-files">${renderTree(tree, v, 0)}</div>
         </div>`;
             });
             if (S.hasMore) {
@@ -849,82 +848,102 @@
         }
 
         // ============ File Grouping ============
-        function groupFiles(files) {
-            if (!files || !files.length) return [];
+        function countFiles(node) {
+            let cnt = node.files.length;
+            Object.values(node.dirs).forEach(d => { cnt += countFiles(d); });
+            return cnt;
+        }
 
-            const topLevelGroups = {};
-            const rootFiles = [];
+        function buildFileTree(files) {
+            const root = { dirs: {}, files: [], path: '' };
+            if (!files || !files.length) return root;
 
             files.forEach(f => {
                 const parts = f.relative_path.split('/');
-                if (parts.length === 1) {
-                    rootFiles.push(f);
-                } else {
-                    const topDir = parts[0];
-                    if (!topLevelGroups[topDir]) topLevelGroups[topDir] = [];
-                    topLevelGroups[topDir].push(f);
-                }
-            });
-
-            const result = [];
-            Object.entries(topLevelGroups).forEach(([topDir, fls]) => {
-                let commonParts = fls[0].relative_path.split('/').slice(0, -1);
-                for (let i = 1; i < fls.length; i++) {
-                    const parts = fls[i].relative_path.split('/').slice(0, -1);
-                    let j = 0;
-                    while (j < commonParts.length && j < parts.length && commonParts[j] === parts[j]) {
-                        j++;
+                let cur = root;
+                for (let i = 0; i < parts.length - 1; i++) {
+                    const p = parts[i];
+                    if (!cur.dirs[p]) {
+                        const curPath = cur.path ? cur.path + '/' + p : p;
+                        cur.dirs[p] = { name: p, path: curPath, dirs: {}, files: [] };
                     }
-                    commonParts.length = j;
+                    cur = cur.dirs[p];
                 }
-                const groupName = commonParts.join('/');
-                result.push({ type: 'folder', name: groupName, path: groupName, files: fls });
+                cur.files.push(f);
             });
 
-            rootFiles.forEach(f => {
-                result.push({ type: 'file', file: f });
-            });
+            function compact(node) {
+                const dirKeys = Object.keys(node.dirs);
+                dirKeys.forEach(k => {
+                    compact(node.dirs[k]);
+                });
+
+                Object.keys(node.dirs).forEach(k => {
+                    let child = node.dirs[k];
+                    if (!child) return;
+
+                    let changed = true;
+                    while (changed) {
+                        changed = false;
+
+                        if (Object.keys(child.dirs).length === 1 && child.files.length === 0) {
+                            const onlyChildKey = Object.keys(child.dirs)[0];
+                            const onlyChild = child.dirs[onlyChildKey];
+
+                            child.name = child.name + '/' + onlyChild.name;
+                            child.path = onlyChild.path;
+                            child.dirs = onlyChild.dirs;
+                            child.files = onlyChild.files;
+                            changed = true;
+                        }
+
+                        if (Object.keys(child.dirs).length === 0 && child.files.length === 1) {
+                            node.files.push(child.files[0]);
+                            delete node.dirs[k];
+                            // since child is deleted, break inner loops
+                            changed = false;
+                        }
+                    }
+                });
+            }
+            compact(root);
 
-            result.sort((a, b) => {
-                if (a.type !== b.type) return a.type === 'folder' ? -1 : 1;
-                return (a.name || a.file.name).localeCompare(b.name || b.file.name);
-            });
-            return result;
+            return root;
         }
 
-        function renderGroups(groups, version) {
-            if (!groups.length) return '';
+        function renderTree(node, version, depth) {
             let h = '';
-            groups.forEach(g => {
-                if (g.type === 'folder') {
-                    const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
-                    h += `
-            <div class="fg-header" onclick="toggleFG('${gid}')">
+
+            const dirKeys = Object.keys(node.dirs).sort((a, b) => a.localeCompare(b));
+            dirKeys.forEach(k => {
+                const d = node.dirs[k];
+                const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
+                const fileCount = countFiles(d);
+                const padding = `padding-left: ${20 + depth * 24}px;`;
+
+                h += `
+            <div class="fg-header" style="${padding}" onclick="toggleFG('${gid}')">
                 <div class="fg-name-wrap">
                     <span class="fg-arrow" id="fa_${gid}">${IC.chevron}</span>
                     <span class="fg-icon">${IC.folder}</span>
-                    <span class="fg-name">${esc(g.name)}/</span>
-                    <span class="fg-count">${g.files.length} 个文件</span>
+                    <span class="fg-name" title="${esc(d.path)}">${esc(d.name)}/</span>
+                    <span class="fg-count">${fileCount} 个文件</span>
                 </div>
                 <div></div>
             </div>
             <div class="fg-children" id="${gid}">
-                ${g.files.map(f => fileRow(f, version, true, g.path)).join('')}
+                ${renderTree(d, version, depth + 1)}
             </div>`;
-                } else {
-                    h += fileRow(g.file, version, false, null);
-                }
             });
-            return h;
-        }
 
-        function fileRow(f, version, isChild, groupPath) {
-            const padding = isChild ? 'padding-left: 44px;' : '';
-            let displayName = f.name;
-            if (groupPath && f.relative_path.startsWith(groupPath + '/')) {
-                displayName = f.relative_path.substring(groupPath.length + 1);
-            }
-            return `
+            node.files.sort((a, b) => a.relative_path.localeCompare(b.relative_path)).forEach(f => {
+                let displayName = f.relative_path || f.name;
+                if (node.path && f.relative_path.startsWith(node.path + '/')) {
+                    displayName = f.relative_path.substring(node.path.length + 1);
+                }
+                const padding = `padding-left: ${depth === 0 ? 20 : 44 + (depth - 1) * 24}px;`;
+
+                h += `
     <div class="file-row" style="${padding}">
         <div class="file-name-col" title="${esc(f.relative_path)}">
             <span class="f-icon">${IC.file}</span>
@@ -935,6 +954,12 @@
             <a class="btn-dl" href="/files/${f.id}/content" download="${esc(f.name)}" onclick="event.stopPropagation();">${IC.download}</a>
         </div>
     </div>`;
+            });
+
+            if (depth === 0 && h === '') {
+                return '<div style="padding:14px 20px;font-size:13px;color:var(--text-muted)">暂无文件</div>';
+            }
+            return h;
         }
 
         function toggleFG(id) {

+ 807 - 0
app/static/records.html

@@ -0,0 +1,807 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Data Nexus - 宽表数据视图</title>
+    <meta name="description" content="Data Nexus 宽表数据视图控制台">
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link
+        href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap"
+        rel="stylesheet">
+    <style>
+        *,
+        *::before,
+        *::after {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+
+        :root {
+            --bg-base: #080c18;
+            --bg-sidebar: #0c1222;
+            --bg-surface: #111827;
+            --bg-card: #151f32;
+            --bg-card-head: rgba(0, 0, 0, 0.2);
+            --bg-hover: rgba(255, 255, 255, 0.04);
+            --bg-active: rgba(99, 179, 237, 0.08);
+            --border: rgba(255, 255, 255, 0.06);
+            --border-card: rgba(255, 255, 255, 0.08);
+            --border-active: rgba(99, 179, 237, 0.35);
+            --text-primary: #e2e8f0;
+            --text-secondary: #8b9ab5;
+            --text-muted: #556477;
+            --accent: #63b3ed;
+            --accent-light: #90cdf4;
+            --accent-dim: rgba(99, 179, 237, 0.12);
+            --green: #68d391;
+            --orange: #f6ad55;
+            --purple: #b794f4;
+            --radius: 8px;
+            --sidebar-w: 280px;
+        }
+
+        body {
+            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'PingFang SC', sans-serif;
+            background: var(--bg-base);
+            color: var(--text-primary);
+            height: 100vh;
+            overflow: hidden;
+            line-height: 1.5;
+        }
+
+        ::-webkit-scrollbar {
+            width: 6px;
+            height: 6px;
+        }
+
+        ::-webkit-scrollbar-track {
+            background: transparent;
+        }
+
+        ::-webkit-scrollbar-thumb {
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 3px;
+        }
+
+        ::-webkit-scrollbar-thumb:hover {
+            background: rgba(255, 255, 255, 0.2);
+        }
+
+        .app {
+            display: grid;
+            grid-template-columns: var(--sidebar-w) 1fr;
+            height: 100vh;
+        }
+
+        .sidebar {
+            background: var(--bg-sidebar);
+            border-right: 1px solid var(--border);
+            display: flex;
+            flex-direction: column;
+            overflow: hidden;
+        }
+
+        .sidebar-header {
+            display: flex;
+            align-items: center;
+            gap: 10px;
+            padding: 20px 20px 16px;
+            flex-shrink: 0;
+        }
+
+        .sidebar-header svg {
+            width: 26px;
+            height: 26px;
+            color: var(--accent);
+            flex-shrink: 0;
+        }
+
+        .sidebar-header span {
+            font-size: 16px;
+            font-weight: 700;
+            background: linear-gradient(135deg, var(--accent-light), var(--purple));
+            -webkit-background-clip: text;
+            background-clip: text;
+            -webkit-text-fill-color: transparent;
+        }
+
+        .sidebar-divider {
+            height: 1px;
+            background: var(--border);
+            margin: 8px 16px;
+            flex-shrink: 0;
+        }
+
+        .stage-tree-wrap {
+            flex: 1;
+            overflow-y: auto;
+            padding: 0 8px 16px;
+        }
+
+        .tree-branch-header {
+            display: flex;
+            align-items: center;
+            gap: 4px;
+            padding: 7px 10px;
+            cursor: pointer;
+            border-radius: 6px;
+            transition: background 0.12s;
+            user-select: none;
+            font-size: 13px;
+            color: var(--text-secondary);
+        }
+
+        .tree-branch-header:hover {
+            background: var(--bg-hover);
+        }
+
+        .tree-arrow {
+            width: 16px;
+            height: 16px;
+            color: var(--text-muted);
+            transition: transform 0.2s;
+            flex-shrink: 0;
+        }
+
+        .tree-arrow.open {
+            transform: rotate(90deg);
+        }
+
+        .tree-children {
+            display: none;
+            padding-left: 8px;
+            margin-left: 12px;
+            border-left: 1px solid var(--border);
+        }
+
+        .tree-children.open {
+            display: block;
+        }
+
+        .tree-leaf {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            padding: 7px 10px 7px 12px;
+            cursor: pointer;
+            border-radius: 6px;
+            transition: all 0.12s;
+            font-size: 13px;
+            color: var(--text-secondary);
+        }
+
+        .tree-leaf:hover {
+            background: var(--bg-hover);
+            color: var(--text-primary);
+        }
+
+        .tree-leaf.active {
+            background: var(--bg-active);
+            color: var(--accent);
+            font-weight: 500;
+        }
+
+        .tree-dot {
+            width: 5px;
+            height: 5px;
+            border-radius: 50%;
+            background: var(--text-muted);
+            flex-shrink: 0;
+        }
+
+        .tree-leaf.active .tree-dot {
+            background: var(--accent);
+        }
+
+        .tree-count {
+            margin-left: auto;
+            font-size: 11px;
+            color: var(--text-muted);
+            background: rgba(255, 255, 255, 0.04);
+            padding: 1px 6px;
+            border-radius: 4px;
+        }
+
+        .content {
+            display: flex;
+            flex-direction: column;
+            height: 100vh;
+            overflow: hidden;
+        }
+
+        .content-header {
+            flex-shrink: 0;
+            padding: 18px 28px;
+            border-bottom: 1px solid var(--border);
+            background: rgba(12, 18, 34, 0.6);
+            backdrop-filter: blur(12px);
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            min-height: 60px;
+        }
+
+        .stage-path {
+            display: flex;
+            align-items: center;
+            gap: 6px;
+            font-size: 14px;
+        }
+
+        .stage-path .sep {
+            color: var(--text-muted);
+            font-size: 11px;
+        }
+
+        .stage-path .seg {
+            color: var(--text-secondary);
+        }
+
+        .stage-path .seg:last-child {
+            color: var(--text-primary);
+            font-weight: 600;
+        }
+
+        .header-info {
+            font-size: 12px;
+            color: var(--text-muted);
+        }
+
+        .content-body {
+            flex: 1;
+            overflow: auto;
+            padding: 24px;
+        }
+
+        /* Welcome & Loading state */
+        .state-box {
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100%;
+            text-align: center;
+            color: var(--text-muted);
+            padding: 40px;
+        }
+
+        .state-box svg {
+            width: 56px;
+            height: 56px;
+            margin-bottom: 20px;
+            opacity: 0.25;
+        }
+
+        .state-box h2 {
+            font-size: 17px;
+            font-weight: 600;
+            color: var(--text-secondary);
+            margin-bottom: 6px;
+        }
+
+        .state-box p {
+            font-size: 13px;
+        }
+
+        .spinner {
+            width: 28px;
+            height: 28px;
+            border: 3px solid var(--border);
+            border-top-color: var(--accent);
+            border-radius: 50%;
+            animation: spin 0.7s linear infinite;
+            margin-bottom: 16px;
+        }
+
+        @keyframes spin {
+            to {
+                transform: rotate(360deg);
+            }
+        }
+
+        /* Records Table */
+        .records-table {
+            width: 100%;
+            border-collapse: separate;
+            border-spacing: 0;
+            background: var(--bg-card);
+            border: 1px solid var(--border-card);
+            border-radius: var(--radius);
+        }
+
+        .records-table th {
+            background: var(--bg-card-head);
+            color: var(--text-secondary);
+            font-size: 13px;
+            font-weight: 600;
+            text-align: left;
+            padding: 14px 16px;
+            border-bottom: 1px solid var(--border);
+            white-space: nowrap;
+        }
+
+        .records-table td {
+            border-bottom: 1px solid var(--border);
+            padding: 16px;
+            vertical-align: top;
+        }
+
+        .records-table tr:last-child td {
+            border-bottom: none;
+        }
+
+        .records-table td.meta-cell {
+            white-space: nowrap;
+        }
+
+        /* Bubble Tree (File rendering) */
+        .bubble-tree {
+            background: rgba(0, 0, 0, 0.15);
+            border-radius: 6px;
+            border: 1px solid var(--border);
+            overflow: hidden;
+            font-size: 12px;
+            min-width: 200px;
+        }
+
+        .fg-header {
+            display: flex;
+            align-items: center;
+            padding: 8px 10px;
+            gap: 6px;
+            cursor: pointer;
+            border-bottom: 1px solid var(--border);
+            transition: background 0.1s;
+        }
+
+        .fg-header:hover {
+            background: var(--bg-hover);
+        }
+
+        .fg-arrow {
+            width: 12px;
+            height: 12px;
+            color: var(--text-muted);
+            transition: transform 0.2s;
+            flex-shrink: 0;
+        }
+
+        .fg-arrow.open {
+            transform: rotate(90deg);
+        }
+
+        .fg-icon {
+            width: 14px;
+            height: 14px;
+            color: var(--orange);
+            flex-shrink: 0;
+        }
+
+        .fg-name {
+            color: var(--text-primary);
+            font-weight: 500;
+        }
+
+        .file-row {
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            padding: 6px 10px;
+            border-bottom: 1px solid var(--border);
+            gap: 10px;
+            transition: background 0.1s;
+        }
+
+        .file-row:hover {
+            background: var(--bg-hover);
+        }
+
+        .file-row:last-child {
+            border-bottom: none;
+        }
+
+        .file-name-col {
+            display: flex;
+            align-items: center;
+            gap: 6px;
+            min-width: 0;
+        }
+
+        .f-icon {
+            width: 14px;
+            height: 14px;
+            color: var(--text-muted);
+            flex-shrink: 0;
+        }
+
+        .f-name {
+            color: var(--text-primary);
+            white-space: nowrap;
+            overflow: hidden;
+            text-overflow: ellipsis;
+            max-width: 200px;
+        }
+
+        .btn-dl {
+            display: inline-flex;
+            align-items: center;
+            padding: 3px 6px;
+            border-radius: 4px;
+            background: var(--accent-dim);
+            color: var(--accent);
+            text-decoration: none;
+            transition: all 0.15s;
+        }
+
+        .btn-dl:hover {
+            background: rgba(99, 179, 237, 0.2);
+        }
+
+        .btn-dl svg {
+            width: 12px;
+            height: 12px;
+        }
+
+        .commit-badge {
+            display: inline-flex;
+            align-items: center;
+            gap: 4px;
+            font-family: 'JetBrains Mono', monospace;
+            background: var(--accent-dim);
+            color: var(--accent);
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            margin-bottom: 4px;
+        }
+
+        .meta-text {
+            font-size: 12px;
+            color: var(--text-muted);
+            margin-bottom: 2px;
+        }
+
+        .empty-cell {
+            color: var(--text-muted);
+            font-size: 13px;
+            font-style: italic;
+        }
+
+        .load-more {
+            display: flex;
+            justify-content: center;
+            padding: 20px 0;
+        }
+
+        .load-more-btn {
+            padding: 9px 28px;
+            border-radius: 6px;
+            border: 1px solid var(--border);
+            background: var(--bg-card);
+            color: var(--text-secondary);
+            cursor: pointer;
+        }
+
+        .load-more-btn:hover {
+            background: var(--bg-hover);
+            color: var(--text-primary);
+        }
+    </style>
+</head>
+
+<body>
+    <div class="app">
+        <aside class="sidebar">
+            <div class="sidebar-header">
+                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                    <rect x="3" y="3" width="18" height="18" rx="2" ry="2" />
+                    <line x1="3" y1="9" x2="21" y2="9" />
+                    <line x1="9" y1="21" x2="9" y2="9" />
+                </svg>
+                <span>宽表数据视图</span>
+            </div>
+            <div class="sidebar-divider"></div>
+            <div class="stage-tree-wrap" id="stageTreeWrap"></div>
+        </aside>
+        <main class="content">
+            <div class="content-header">
+                <div class="stage-path" id="stagePath"><span class="seg" style="color:var(--text-muted)">选择左侧数据阶段</span>
+                </div>
+                <div class="header-info" id="headerInfo"></div>
+            </div>
+            <div class="content-body" id="contentBody">
+                <div class="state-box">
+                    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
+                        <rect x="3" y="3" width="18" height="18" rx="2" ry="2" />
+                        <line x1="3" y1="9" x2="21" y2="9" />
+                        <line x1="9" y1="21" x2="9" y2="9" />
+                    </svg>
+                    <h2>欢迎使用宽表视图控制台</h2>
+                    <p>从左侧选择阶段,即可查看以 group key 聚合排列的数据网格表</p>
+                </div>
+            </div>
+        </main>
+    </div>
+
+    <script>
+        // Inline SVG icon markup keyed by icon name; the strings are interpolated
+        // as-is into the HTML built by the render functions in this script.
+        const IC = {
+            commit: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="4"/><line x1="1.05" y1="12" x2="7" y2="12"/><line x1="17.01" y1="12" x2="22.96" y2="12"/></svg>',
+            file: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/></svg>',
+            folder: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 19a2 2 0 01-2 2H4a2 2 0 01-2-2V5a2 2 0 012-2h5l2 3h9a2 2 0 012 2z"/></svg>',
+            download: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" y1="15" x2="12" y2="3"/></svg>',
+            chevron: '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="9 18 15 12 9 6"/></svg>',
+        };
+
+        // Number of records requested per page (sent as the `limit` query parameter).
+        const PAGE_SIZE = 20;
+        // Page state: the stage list, a stage-name -> project_id lookup, the selected stage,
+        // the records loaded so far, the pagination offset, whether another page may exist,
+        // and a flag that is set while a records request is in flight.
+        let S = { stages: [], stageProjectMap: {}, stage: null, records: [], skip: 0, hasMore: true, loading: false };
+
+        // Shorthand for document.getElementById.
+        const $ = id => document.getElementById(id);
+        function esc(s) { if (!s) return ''; const d = document.createElement('div'); d.textContent = s; return d.innerHTML; }
+        function relTime(iso) {
+            if (!iso) return '';
+            const m = Math.floor((Date.now() - new Date(iso).getTime()) / 60000);
+            if (m < 1) return '刚刚'; if (m < 60) return m + ' 分钟前';
+            const h = Math.floor(m / 60); if (h < 24) return h + ' 小时前';
+            const d = Math.floor(h / 24); if (d < 30) return d + ' 天前';
+            const date = new Date(iso);
+            return `${date.getFullYear()}-${String(date.getMonth() + 1).padStart(2, '0')}-${String(date.getDate()).padStart(2, '0')} ${String(date.getHours()).padStart(2, '0')}:${String(date.getMinutes()).padStart(2, '0')}`;
+        }
+
+        async function api(url) { const r = await fetch(url); if (!r.ok) throw new Error(r.status); return r.json(); }
+
+        async function loadAllStages() {
+            $('stageTreeWrap').innerHTML = '<div style="padding:16px;text-align:center;"><div class="spinner" style="margin:0 auto 8px;"></div><span style="font-size:12px;color:var(--text-muted)">加载中...</span></div>';
+            try {
+                S.stages = await api('/stages/all');
+                S.stageProjectMap = {};
+                S.stages.forEach(st => { S.stageProjectMap[st.name] = st.project_id; });
+                renderStageTree();
+            } catch (e) { $('stageTreeWrap').innerHTML = '<div style="padding:16px;color:#fc8181;font-size:13px;">加载失败</div>'; }
+        }
+
+        function buildTree(stages) {
+            const root = [];
+            for (const st of stages) {
+                const parts = st.name.split('/');
+                let cur = root;
+                for (let i = 0; i < parts.length; i++) {
+                    let node = cur.find(n => n.label === parts[i]);
+                    if (!node) { node = { label: parts[i], children: [] }; cur.push(node); }
+                    if (i === parts.length - 1) { node.stage = st.name; node.count = st.version_count; }
+                    cur = node.children;
+                }
+            }
+            return root;
+        }
+
+        function renderStageTree() {
+            const tree = buildTree(S.stages);
+            $('stageTreeWrap').innerHTML = tree.length ? renderNodes(tree) : '<div style="padding:16px;font-size:13px;color:var(--text-muted)">暂无数据阶段</div>';
+        }
+
+        function renderNodes(nodes) {
+            let h = '';
+            for (const n of nodes) {
+                if (n.stage && n.children.length === 0) {
+                    h += `<div class="tree-leaf" data-stage="${esc(n.stage)}" onclick="selectStage(this, '${esc(n.stage)}')">
+                    <span class="tree-dot"></span><span>${esc(n.label)}</span>
+                    <span class="tree-count">${n.count || ''}</span>
+                </div>`;
+                } else {
+                    const id = 'tb_' + Math.random().toString(36).substr(2, 6);
+                    h += `<div>
+                    <div class="tree-branch-header" onclick="toggleBranch('${id}', this)">
+                        <span class="tree-arrow" id="a_${id}">${IC.chevron}</span><span>${esc(n.label)}</span>
+                    </div>
+                    <div class="tree-children" id="${id}">${renderNodes(n.children)}</div>
+                </div>`;
+                }
+            }
+            return h;
+        }
+
+        function toggleBranch(id) {
+            const ch = $(id), ar = $('a_' + id);
+            if (ch) ch.classList.toggle('open');
+            if (ar) ar.classList.toggle('open');
+        }
+
+        function selectStage(el, stageName) {
+            document.querySelectorAll('.tree-leaf.active').forEach(e => e.classList.remove('active'));
+            el.classList.add('active');
+            S.stage = stageName;
+            S.records = []; S.skip = 0; S.hasMore = true;
+            updateHeader();
+            loadRecords();
+        }
+
+        function updateHeader() {
+            if (!S.stage) {
+                $('stagePath').innerHTML = '<span class="seg" style="color:var(--text-muted)">选择左侧数据阶段</span>';
+                return;
+            }
+            const parts = S.stage.split('/');
+            $('stagePath').innerHTML = parts.map((p, i) => `${i > 0 ? '<span class="sep">/</span>' : ''}<span class="seg">${esc(p)}</span>`).join('');
+        }
+
+        async function loadRecords(append = false) {
+            if (S.loading) return;
+            S.loading = true;
+            if (!append) $('contentBody').innerHTML = '<div class="state-box"><div class="spinner"></div><p>加载中...</p></div>';
+
+            try {
+                const pid = S.stageProjectMap[S.stage];
+                const data = await api(`/projects/${pid}/records?stage=${encodeURIComponent(S.stage)}&skip=${S.skip}&limit=${PAGE_SIZE}`);
+
+                if (!append) S.records = [];
+                S.records.push(...data);
+                S.hasMore = data.length >= PAGE_SIZE;
+                S.skip += data.length;
+
+                renderTable();
+            } catch (e) {
+                if (!append) $('contentBody').innerHTML = '<div class="state-box"><p style="color:#fc8181;">加载失败: ' + esc(e.message) + '</p></div>';
+            }
+            S.loading = false;
+        }
+
+        /* ------- Bubble Directory Tree Builder ------- */
+        function buildFileTree(files) {
+            const root = { dirs: {}, files: [], path: '' };
+            if (!files || !files.length) return root;
+
+            files.forEach(f => {
+                const parts = f.relative_path.split('/');
+                let cur = root;
+                for (let i = 0; i < parts.length - 1; i++) {
+                    const p = parts[i];
+                    if (!cur.dirs[p]) {
+                        const curPath = cur.path ? cur.path + '/' + p : p;
+                        cur.dirs[p] = { name: p, path: curPath, dirs: {}, files: [] };
+                    }
+                    cur = cur.dirs[p];
+                }
+                cur.files.push(f);
+            });
+
+            // Compact single-child directories
+            function compact(node) {
+                Object.keys(node.dirs).forEach(k => compact(node.dirs[k]));
+                Object.keys(node.dirs).forEach(k => {
+                    let child = node.dirs[k];
+                    if (!child) return;
+                    let changed = true;
+                    while (changed) {
+                        changed = false;
+                        if (Object.keys(child.dirs).length === 1 && child.files.length === 0) {
+                            const onlyChildKey = Object.keys(child.dirs)[0];
+                            const onlyChild = child.dirs[onlyChildKey];
+                            child.name = child.name + '/' + onlyChild.name;
+                            child.path = onlyChild.path;
+                            child.dirs = onlyChild.dirs;
+                            child.files = onlyChild.files;
+                            changed = true;
+                        }
+                    }
+                });
+            }
+            compact(root);
+            return root;
+        }
+
+        function renderSubTree(node, depth) {
+            let h = '';
+            const dirKeys = Object.keys(node.dirs).sort((a, b) => a.localeCompare(b));
+            dirKeys.forEach(k => {
+                const d = node.dirs[k];
+                const gid = 'fg_' + Math.random().toString(36).substr(2, 6);
+                const padding = `padding-left: ${10 + depth * 14}px;`;
+                h += `
+                <div class="fg-header" style="${padding}" onclick="toggleBranch('${gid}')">
+                    <span class="fg-arrow" id="a_${gid}">${IC.chevron}</span>
+                    <span class="fg-icon">${IC.folder}</span>
+                    <span class="fg-name" title="${esc(d.path)}">${esc(d.name)}/</span>
+                </div>
+                <div class="tree-children open" id="${gid}" style="margin-left:0; padding-left:0; border-left:none;">
+                    ${renderSubTree(d, depth + 1)}
+                </div>`;
+            });
+
+            node.files.sort((a, b) => a.relative_path.localeCompare(b.relative_path)).forEach(f => {
+                let displayName = f.relative_path;
+                if (node.path && f.relative_path.startsWith(node.path + '/')) {
+                    displayName = f.relative_path.substring(node.path.length + 1);
+                } else if (f.relative_path.includes('/')) {
+                    displayName = f.relative_path.split('/').pop();
+                }
+                const padding = `padding-left: ${10 + depth * 14 + (dirKeys.length > 0 ? 18 : 0)}px;`;
+                h += `
+                <div class="file-row" style="${padding}">
+                    <div class="file-name-col" title="${esc(f.relative_path)}">
+                        <span class="f-icon">${IC.file}</span>
+                        <span class="f-name">${esc(displayName)}</span>
+                    </div>
+                    <a class="btn-dl" href="/files/${f.id}/content" download><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" y1="15" x2="12" y2="3"/></svg></a>
+                </div>`;
+                if (f.extracted_value) {
+                    h += `<div style="padding-left: ${10 + depth * 14 + (dirKeys.length > 0 ? 18 : 0)}px; padding-right:10px; padding-bottom:6px; margin-top:-4px; color:var(--text-muted); font-size:11px; white-space:pre-wrap; word-break:break-all;">↳ Extract: ${esc(f.extracted_value)}</div>`;
+                }
+            });
+            return h;
+        }
+
+        function renderBubbleTree(files) {
+            if (!files || files.length === 0) return '<div class="empty-cell">无数据</div>';
+            const root = buildFileTree(files);
+            return `<div class="bubble-tree" style="margin-bottom:8px;">${renderSubTree(root, 0)}</div>`;
+        }
+
+        /* ------- Render Table ------- */
+        // Render S.records as a wide table into #contentBody: one row per record, a
+        // metadata column, then one column per distinct input label and one per distinct
+        // output label. Appends a "load more" button while S.hasMore is set.
+        function renderTable() {
+            // Nothing loaded: show the empty-state box instead of a table.
+            if (!S.records.length) {
+                $('contentBody').innerHTML = '<div class="state-box"><h2>暂无数据</h2><p>找不到符合要求的阶段记录数据</p></div>';
+                return;
+            }
+
+            // Extract dynamic columns: collect every label seen across all loaded records.
+            // Files without a label are grouped under the '未命名' placeholder.
+            const inLabels = new Set();
+            const outLabels = new Set();
+
+            S.records.forEach(r => {
+                (r.inputs || []).forEach(f => inLabels.add(f.label || '未命名'));
+                (r.outputs || []).forEach(f => outLabels.add(f.label || '未命名'));
+            });
+
+            // Sorted so the column order is stable between renders.
+            const sortedInLabels = Array.from(inLabels).sort();
+            const sortedOutLabels = Array.from(outLabels).sort();
+
+            // Header row: metadata column first, then input columns, then output columns.
+            let h = `<div style="overflow-x:auto;">
+            <table class="records-table">
+                <thead>
+                    <tr>
+                        <th>Metadata</th>`;
+            sortedInLabels.forEach(lbl => h += `<th>${esc(lbl)} (输入)</th>`);
+            sortedOutLabels.forEach(lbl => h += `<th>${esc(lbl)} (输出)</th>`);
+            h += `      </tr>
+                </thead>
+                <tbody>`;
+
+            // Body: one row per record; the group key line is shown only when present.
+            S.records.forEach(r => {
+                h += `<tr><td class="meta-cell">
+                <div class="commit-badge">${IC.commit} ${esc(r.commit_id.substring(0, 8))}</div>
+                <div class="meta-text">By: ${esc(r.author || 'unknown')}</div>
+                <div class="meta-text">Time: ${relTime(r.created_at)}</div>
+                ${r.group_key ? `<div class="meta-text" style="color:var(--orange)">Grp: ${esc(r.group_key)}</div>` : ''}
+            </td>`;
+
+                // One cell per input label, holding this record's files with that label.
+                sortedInLabels.forEach(lbl => {
+                    const groupFiles = (r.inputs || []).filter(f => (f.label || '未命名') === lbl);
+                    h += `<td>${renderBubbleTree(groupFiles)}</td>`;
+                });
+
+                // Same for the output labels.
+                sortedOutLabels.forEach(lbl => {
+                    const groupFiles = (r.outputs || []).filter(f => (f.label || '未命名') === lbl);
+                    h += `<td>${renderBubbleTree(groupFiles)}</td>`;
+                });
+                h += `</tr>`;
+            });
+
+            h += `  </tbody>
+            </table>
+        </div>`;
+
+            // Offer another page only while the last fetch suggested more may exist.
+            if (S.hasMore) {
+                h += '<div class="load-more"><button class="load-more-btn" onclick="loadRecords(true)">加载更多</button></div>';
+            }
+
+            $('contentBody').innerHTML = h;
+        }
+
+        loadAllStages();
+    </script>
+</body>
+
+</html>

+ 21 - 0
manifest.yaml.example

@@ -40,6 +40,23 @@ stages:
       # 示例 D:也可以指定单个文件
       - path: final_report.docx
 
+  # ---------- 阶段 4:带有元数据与 JSON 值提取的数据 ----------
+  - name: enhanced_data
+    outputs:
+      # 示例 E:指定文件为“输入”,并打上标签
+      - path: data/input/article.md
+        direction: input        # 指定是 input 还是 output
+        label: 帖子输入         # 指定该数据的业务名称(标签)
+
+      # 示例 F:指定文件为“输出”,并且如果是 JSON 文件,可以提取特定 key 的值
+      #          同时指定深度 directory_depth: 2,这样如果文件在 `data/output/foo/bar.json`
+      #          它的 group_key 会被设置成 `data/output` 而不是默认的 `data/output/foo`
+      - path: data/output/
+        pattern: "*.json"
+        direction: output
+        label: 灵感点
+        extract_json_key: "data.idea_content"  # 会解析 JSON 并提取对应 key 的值保存
+        directory_depth: 2
 
 # ============================================================
 # 字段说明
@@ -59,6 +76,10 @@ stages:
 #     - exclude (可选) 文件排除规则,支持通配符或列表
 #                      示例: "*.tmp"
 #                      示例: ["*.log", ".DS_Store"]
+#     - direction (可选) 该文件的流入/流出方向(如 'input', 'output' 等)
+#     - label     (可选) 该文件的业务称呼/标签(如 '帖子输入', '灵感点' 等)
+#     - extract_json_key (可选) 针对 JSON 文件,配置要提取的 JSON key 路径(嵌套层级用 . 分隔,例如 'data.content')。提取的值会被记录在数据库中。
+#     - directory_depth  (可选) 定义这组规则生成的文件关联用的父目录深度(如 1 或 2,用来将不同子目录的关联文件合并到一行展示)。
 #
 # ============================================================
 # 工作流程