3 ay önce · d2c1de3774
--- a/app/services/webhook_service.py
+++ b/app/services/webhook_service.py
@@ -136,7 +136,9 @@ class WebhookService:
 
				 
			
 
				         for output in outputs:
			
 
				             raw_path_pattern = output.get("path", "")
			
 
				-            match_pattern = output.get("pattern", "*")
			
 
				+            # Support both string and list for pattern and exclude
			
 
				+            patterns = output.get("pattern", "*")
			
 
				+            excludes = output.get("exclude")
			
 
				 
			
 
				             path_pattern = normalize_path(raw_path_pattern)
			
 
				             is_dir = is_directory_pattern(raw_path_pattern)
			
@@ -144,15 +146,21 @@ class WebhookService:
 
				             if is_dir:
			
 
				                 # Directory pattern: fetch only this directory's files
			
 
				                 dir_path = path_pattern.rstrip("/")
			
 
				-                logger.info(f"Fetching directory: {dir_path} with pattern: {match_pattern}")
			
 
				+                logger.info(f"Fetching directory: {dir_path} with patterns: {patterns}, excludes: {excludes}")
			
 
				 
			
 
				                 files = await self.gogs.get_directory_tree(owner, repo_name, commit_id, dir_path)
			
 
				 
			
 
				                 for file_info in files:
			
 
				                     file_path = file_info.get("path")
			
 
				-                    # Apply pattern matching
			
 
				-                    rel_name = file_path[len(dir_path) + 1:] if file_path.startswith(dir_path + "/") else file_path
			
 
				-                    if fnmatch.fnmatch(rel_name, match_pattern):
			
 
				+                    # Calculate name relative to the watched directory
			
 
				+                    # e.g. dir_path="a", file_path="a/b.txt" -> rel_name="b.txt"
			
 
				+                    rel_name = (
			
 
				+                        file_path[len(dir_path) + 1 :]
			
 
				+                        if file_path.startswith(dir_path + "/")
			
 
				+                        else file_path
			
 
				+                    )
			
 
				+
			
 
				+                    if self._match_patterns(rel_name, patterns, excludes):
			
 
				                         try:
			
 
				                             changed = await self.storage.process_file_with_sha(
			
 
				                                 version, file_path, file_info.get("sha"), owner, repo_name
			
@@ -167,15 +175,45 @@ class WebhookService:
 
				 
			
 
				                 file_info = await self.gogs.get_file_info(owner, repo_name, commit_id, path_pattern)
			
 
				                 if file_info:
			
 
				-                    try:
			
 
				-                        changed = await self.storage.process_file_with_sha(
			
 
				-                            version, path_pattern, file_info.get("sha"), owner, repo_name
			
 
				-                        )
			
 
				-                        if changed:
			
 
				-                            has_changes = True
			
 
				-                    except Exception as e:
			
 
				-                        logger.error(f"Failed to process file {path_pattern}: {e}")
			
 
				+                    # Apply pattern matching to the filename for consistency
			
 
				+                    import os
			
 
				+                    filename = os.path.basename(path_pattern)
			
 
				+                    if self._match_patterns(filename, patterns, excludes):
			
 
				+                        try:
			
 
				+                            changed = await self.storage.process_file_with_sha(
			
 
				+                                version, path_pattern, file_info.get("sha"), owner, repo_name
			
 
				+                            )
			
 
				+                            if changed:
			
 
				+                                has_changes = True
			
 
				+                        except Exception as e:
			
 
				+                            logger.error(f"Failed to process file {path_pattern}: {e}")
			
 
				                 else:
			
 
				                     logger.warning(f"File not found: {path_pattern}")
			
 
				 
			
 
				         return has_changes
			
 
				+
			
 
				+    def _match_patterns(
			
 
				+        self,
			
 
				+        filename: str,
			
 
				+        include_patterns: str | list[str],
			
 
				+        exclude_patterns: str | list[str] | None = None,
			
 
				+    ) -> bool:
			
 
				+        """Helper to match filename against multiple include and exclude glob patterns."""
			
 
				+        # Normalize to lists
			
 
				+        includes = (
			
 
				+            [include_patterns] if isinstance(include_patterns, str) else include_patterns
			
 
				+        )
			
 
				+        excludes = []
			
 
				+        if exclude_patterns:
			
 
				+            excludes = (
			
 
				+                [exclude_patterns] if isinstance(exclude_patterns, str) else exclude_patterns
			
 
				+            )
			
 
				+
			
 
				+        # 1. Check if it matches ANY include pattern (OR logic)
			
 
				+        is_included = any(fnmatch.fnmatch(filename, p) for p in includes)
			
 
				+        if not is_included:
			
 
				+            return False
			
 
				+
			
 
				+        # 2. Check if it matches ANY exclude pattern (OR logic: any match means reject)
			
 
				+        is_excluded = any(fnmatch.fnmatch(filename, p) for p in excludes)
			
 
				+        return not is_excluded
			
--- a/manifest.yaml.example
+++ b/manifest.yaml.example
@@ -0,0 +1,77 @@
 
				+# ============================================================
			
 
				+# Data Nexus - manifest.yaml 示例
			
 
				+# 将此文件重命名为 manifest.yaml 放到你的 Git 仓库根目录
			
 
				+# Data Nexus 会在每次 push 时自动读取并处理
			
 
				+# ============================================================
			
 
				+
			
 
				+# 项目名称（必填）
			
 
				+# 同一个 project_name 的数据会归档在同一个项目下
			
 
				+project_name: what-创作
			
 
				+
			
 
				+# 每个 stage 代表数据流水线中的一个环节
			
 
				+stages:
			
 
				+  # ---------- 阶段 1：原始数据 ----------
			
 
				+  - name: raw
			
 
				+    outputs:
			
 
				+      # 示例 A：收集目录下的所有文件（默认方式）
			
 
				+      - path: data/raw/
			
 
				+        pattern: "*"
			
 
				+
			
 
				+  # ---------- 阶段 2：清洗后数据 ----------
			
 
				+  - name: cleaned
			
 
				+    outputs:
			
 
				+      # 示例 B：收集特定类型文件，并排除临时文件
			
 
				+      - path: data/cleaned/
			
 
				+        pattern: "*.csv"        # 只收集 CSV
			
 
				+        exclude: "tmp_*"        # 排除以 tmp_ 开头的中间文件（如 tmp_part.csv）
			
 
				+
			
 
				+  # ---------- 阶段 3：多模式匹配与复杂过滤 ----------
			
 
				+  - name: output
			
 
				+    outputs:
			
 
				+      # 示例 C：使用列表指定多个包含模式和多个排除模式
			
 
				+      - path: results/
			
 
				+        pattern: 
			
 
				+          - "*.pdf"
			
 
				+          - "*.xlsx"
			
 
				+        exclude:
			
 
				+          - "~*"                # 排除 Office 临时锁定文件
			
 
				+          - "draft_*"           # 排除草稿
			
 
				+      
			
 
				+      # 示例 D：也可以指定单个文件
			
 
				+      - path: final_report.docx
			
 
				+
			
 
				+
			
 
				+# ============================================================
			
 
				+# 字段说明
			
 
				+# ============================================================
			
 
				+#
			
 
				+# project_name  (必填) 项目名称，同名自动归档到同一项目
			
 
				+#
			
 
				+# stages        (推荐) 阶段列表，每个阶段包含：
			
 
				+#   - name      (必填) 阶段名称，如 raw / cleaned / output
			
 
				+#   - outputs   (必填) 输出规则列表，每条规则包含：
			
 
				+#     - path    (必填) 文件路径或目录路径
			
 
				+#                      以 / 结尾 → 视为目录，收集目录下的文件
			
 
				+#                      不以 / 结尾 → 视为单个文件
			
 
				+#     - pattern (可选) 文件包含规则，支持通配符或列表
			
 
				+#                      示例: "*.csv"
			
 
				+#                      示例: ["*.csv", "*.json"]
			
 
				+#     - exclude (可选) 文件排除规则，支持通配符或列表
			
 
				+#                      示例: "*.tmp"
			
 
				+#                      示例: ["*.log", ".DS_Store"]
			
 
				+#
			
 
				+# ============================================================
			
 
				+# 工作流程
			
 
				+# ============================================================
			
 
				+#
			
 
				+# 1. 开发者在仓库根目录放置 manifest.yaml
			
 
				+# 2. 开发者 git push 代码
			
 
				+# 3. Gogs Webhook 触发 → Data Nexus 收到推送通知
			
 
				+# 4. Data Nexus 读取该 commit 的 manifest.yaml
			
 
				+# 5. 根据 stages 配置，自动提取对应的产出文件
			
 
				+# 6. 文件上传到 OSS，元数据记录到数据库
			
 
				+# 7. 通过 Web 页面即可浏览和下载
			
 
				+#
			
 
				+# 注意：如果某次 push 没有改变任何数据文件（仅改了代码），
			
 
				+#       Data Nexus 会自动识别并跳过，不会创建空的版本记录。
			
 
				+# ============================================================
			
--- a/使用指南.md
+++ b/使用指南.md
@@ -101,7 +101,8 @@ stages:
 
				 | `stages[].name` | ✅ | 环节名称，如 selection、cleaning、analysis |
			
 
				 | `stages[].outputs` | ✅ | 要上传的文件/目录列表 |
			
 
				 | `outputs[].path` | ✅ | 文件或目录路径（相对于项目根目录） |
			
 
				-| `outputs[].pattern` | ❌ | 文件匹配模式，默认 `*`（匹配所有） |
			
 
				+| `outputs[].pattern` | ❌ | 文件包含模式，默认 `*`。支持单个字符串或列表 |
			
 
				+| `outputs[].exclude` | ❌ | 文件排除模式，默认无。支持单个字符串或列表 |
			
 
				 
			
 
				 ### path 写法
			
 
				 
			
@@ -115,6 +116,11 @@ stages:
 
				 # 带匹配模式的目录
			
 
				 - path: "./images/"
			
 
				   pattern: "*.png"          # 只匹配 png 文件
			
 
				+
			
 
				+# 多模式与排除（升级功能）
			
 
				+- path: "./data/"
			
 
				+  pattern: ["*.csv", "*.xlsx"]  # 包含 csv 和 xlsx
			
 
				+  exclude: ["temp_*", "*.tmp"]  # 排除临时文件
			
 
				 ```
			
 
				 
			
 
				 ---