Jelajahi Sumber

feat:支持指定排除文件

tanjingyu 1 Minggu lalu
induk
melakukan
d2c1de3774
3 mengubah file dengan 135 tambahan dan 14 penghapusan
  1. 51 13
      app/services/webhook_service.py
  2. 77 0
      manifest.yaml.example
  3. 7 1
      使用指南.md

+ 51 - 13
app/services/webhook_service.py

@@ -136,7 +136,9 @@ class WebhookService:
 
         for output in outputs:
             raw_path_pattern = output.get("path", "")
-            match_pattern = output.get("pattern", "*")
+            # Support both string and list for pattern and exclude
+            patterns = output.get("pattern", "*")
+            excludes = output.get("exclude")
 
             path_pattern = normalize_path(raw_path_pattern)
             is_dir = is_directory_pattern(raw_path_pattern)
@@ -144,15 +146,21 @@ class WebhookService:
             if is_dir:
                 # Directory pattern: fetch only this directory's files
                 dir_path = path_pattern.rstrip("/")
-                logger.info(f"Fetching directory: {dir_path} with pattern: {match_pattern}")
+                logger.info(f"Fetching directory: {dir_path} with patterns: {patterns}, excludes: {excludes}")
 
                 files = await self.gogs.get_directory_tree(owner, repo_name, commit_id, dir_path)
 
                 for file_info in files:
                     file_path = file_info.get("path")
-                    # Apply pattern matching
-                    rel_name = file_path[len(dir_path) + 1:] if file_path.startswith(dir_path + "/") else file_path
-                    if fnmatch.fnmatch(rel_name, match_pattern):
+                    # Calculate name relative to the watched directory
+                    # e.g. dir_path="a", file_path="a/b.txt" -> rel_name="b.txt"
+                    rel_name = (
+                        file_path[len(dir_path) + 1 :]
+                        if file_path.startswith(dir_path + "/")
+                        else file_path
+                    )
+
+                    if self._match_patterns(rel_name, patterns, excludes):
                         try:
                             changed = await self.storage.process_file_with_sha(
                                 version, file_path, file_info.get("sha"), owner, repo_name
@@ -167,15 +175,45 @@ class WebhookService:
 
                 file_info = await self.gogs.get_file_info(owner, repo_name, commit_id, path_pattern)
                 if file_info:
-                    try:
-                        changed = await self.storage.process_file_with_sha(
-                            version, path_pattern, file_info.get("sha"), owner, repo_name
-                        )
-                        if changed:
-                            has_changes = True
-                    except Exception as e:
-                        logger.error(f"Failed to process file {path_pattern}: {e}")
+                    # Apply pattern matching to the filename for consistency
+                    import os
+                    filename = os.path.basename(path_pattern)
+                    if self._match_patterns(filename, patterns, excludes):
+                        try:
+                            changed = await self.storage.process_file_with_sha(
+                                version, path_pattern, file_info.get("sha"), owner, repo_name
+                            )
+                            if changed:
+                                has_changes = True
+                        except Exception as e:
+                            logger.error(f"Failed to process file {path_pattern}: {e}")
                 else:
                     logger.warning(f"File not found: {path_pattern}")
 
         return has_changes
+
+    def _match_patterns(
+        self,
+        filename: str,
+        include_patterns: str | list[str] | None,
+        exclude_patterns: str | list[str] | None = None,
+    ) -> bool:
+        """Decide whether ``filename`` passes the include/exclude glob filters.
+
+        Matching uses ``fnmatch.fnmatch``. Exclusion wins: a name that matches
+        both an include and an exclude pattern is rejected.
+
+        Args:
+            filename: Name (or directory-relative path) to test.
+            include_patterns: A single glob or a list of globs. ``None`` is
+                treated as ``"*"`` (match everything). An empty list matches
+                nothing.
+            exclude_patterns: A single glob, a list of globs, or ``None`` /
+                empty for "exclude nothing".
+
+        Returns:
+            True when ``filename`` matches at least one include pattern and
+            no exclude pattern.
+        """
+        # ``dict.get("pattern", "*")`` only falls back to "*" when the key is
+        # absent; a key that is present with a null value arrives here as None
+        # and would otherwise fail with "NoneType is not iterable".
+        if include_patterns is None:
+            include_patterns = "*"
+
+        # Normalize both arguments to lists so a bare string is not iterated
+        # character by character.
+        includes = (
+            [include_patterns] if isinstance(include_patterns, str) else include_patterns
+        )
+        excludes = []
+        if exclude_patterns:
+            excludes = (
+                [exclude_patterns] if isinstance(exclude_patterns, str) else exclude_patterns
+            )
+
+        # 1. Must match ANY include pattern (OR logic).
+        if not any(fnmatch.fnmatch(filename, p) for p in includes):
+            return False
+
+        # 2. Must match NO exclude pattern (any single match rejects).
+        return not any(fnmatch.fnmatch(filename, p) for p in excludes)

+ 77 - 0
manifest.yaml.example

@@ -0,0 +1,77 @@
+# ============================================================
+# Data Nexus - manifest.yaml 示例
+# 将此文件重命名为 manifest.yaml 放到你的 Git 仓库根目录
+# Data Nexus 会在每次 push 时自动读取并处理
+# ============================================================
+
+# 项目名称(必填)
+# 同一个 project_name 的数据会归档在同一个项目下
+project_name: what-创作
+
+# 每个 stage 代表数据流水线中的一个环节
+stages:
+  # ---------- 阶段 1:原始数据 ----------
+  - name: raw
+    outputs:
+      # 示例 A:收集目录下的所有文件(默认方式)
+      - path: data/raw/
+        pattern: "*"
+
+  # ---------- 阶段 2:清洗后数据 ----------
+  - name: cleaned
+    outputs:
+      # 示例 B:收集特定类型文件,并排除临时文件
+      - path: data/cleaned/
+        pattern: "*.csv"        # 只收集 CSV
+        exclude: "tmp_*"        # 排除以 tmp_ 开头的中间文件(如 tmp_merge.csv)
+
+  # ---------- 阶段 3:多模式匹配与复杂过滤 ----------
+  - name: output
+    outputs:
+      # 示例 C:使用列表指定多个包含模式和多个排除模式
+      - path: results/
+        pattern: 
+          - "*.pdf"
+          - "*.xlsx"
+        exclude:
+          - "~*"                # 排除 Office 临时锁定文件
+          - "draft_*"           # 排除草稿
+      
+      # 示例 D:也可以指定单个文件
+      - path: final_report.docx
+
+
+# ============================================================
+# 字段说明
+# ============================================================
+#
+# project_name  (必填) 项目名称,同名自动归档到同一项目
+#
+# stages        (推荐) 阶段列表,每个阶段包含:
+#   - name      (必填) 阶段名称,如 raw / cleaned / output
+#   - outputs   (必填) 输出规则列表,每条规则包含:
+#     - path    (必填) 文件路径或目录路径
+#                      以 / 结尾 → 视为目录,收集目录下的文件
+#                      不以 / 结尾 → 视为单个文件
+#     - pattern (可选) 文件包含规则,支持通配符或列表,默认 "*"(匹配所有文件)
+#                      示例: "*.csv"
+#                      示例: ["*.csv", "*.json"]
+#     - exclude (可选) 文件排除规则,支持通配符或列表
+#                      同时匹配 pattern 与 exclude 的文件会被排除(exclude 优先)
+#                      示例: "*.tmp"
+#                      示例: ["*.log", ".DS_Store"]
+#
+# ============================================================
+# 工作流程
+# ============================================================
+#
+# 1. 开发者在仓库根目录放置 manifest.yaml
+# 2. 开发者 git push 代码
+# 3. Gogs Webhook 触发 → Data Nexus 收到推送通知
+# 4. Data Nexus 读取该 commit 的 manifest.yaml
+# 5. 根据 stages 配置,自动提取对应的产出文件
+# 6. 文件上传到 OSS,元数据记录到数据库
+# 7. 通过 Web 页面即可浏览和下载
+#
+# 注意:如果某次 push 没有改变任何数据文件(仅改了代码),
+#       Data Nexus 会自动识别并跳过,不会创建空的版本记录。
+# ============================================================

+ 7 - 1
使用指南.md

@@ -101,7 +101,8 @@ stages:
 | `stages[].name` | ✅ | 环节名称,如 selection、cleaning、analysis |
 | `stages[].outputs` | ✅ | 要上传的文件/目录列表 |
 | `outputs[].path` | ✅ | 文件或目录路径(相对于项目根目录) |
-| `outputs[].pattern` | ❌ | 文件匹配模式,默认 `*`(匹配所有) |
+| `outputs[].pattern` | ❌ | 文件包含模式,默认 `*`。支持单个字符串或列表 |
+| `outputs[].exclude` | ❌ | 文件排除模式,默认无。支持单个字符串或列表 |
 
 ### path 写法
 
@@ -115,6 +116,11 @@ stages:
 # 带匹配模式的目录
 - path: "./images/"
   pattern: "*.png"          # 只匹配 png 文件
+
+# 多模式与排除(升级功能)
+- path: "./data/"
+  pattern: ["*.csv", "*.xlsx"]  # 包含 csv 和 xlsx
+  exclude: ["temp_*", "*.tmp"]  # 排除临时文件
 ```
 
 ---