|
|
@@ -1,6 +1,9 @@
|
|
|
import yaml
|
|
|
import logging
|
|
|
import fnmatch
|
|
|
+import os
|
|
|
+import asyncio
|
|
|
+import re
|
|
|
from sqlalchemy.orm import Session
|
|
|
from app.models import Project, DataVersion
|
|
|
from app.services.gogs_client import GogsClient
|
|
|
@@ -131,9 +134,9 @@ class WebhookService:
|
|
|
|
|
|
logger.info(f"Processing stage '{stage_name}' with {len(outputs)} output rules")
|
|
|
|
|
|
- # Process outputs and check if any file actually changed
|
|
|
+ # Process ONLY changed files that match output rules (no directory tree fetching)
|
|
|
has_new_uploads = await self._process_outputs(
|
|
|
- version, outputs, owner, repo_name, after_sha
|
|
|
+ version, outputs, owner, repo_name, after_sha, all_changed_files
|
|
|
)
|
|
|
|
|
|
# Check if this version represents a real change (content OR file set)
|
|
|
@@ -183,107 +186,136 @@ class WebhookService:
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
- async def _process_outputs(
|
|
|
- self, version, outputs: list, owner: str, repo_name: str, commit_id: str
|
|
|
- ) -> bool:
|
|
|
- """Process output rules, create snapshot records for ALL matching files.
|
|
|
def _find_matching_output(self, file_path: str, outputs: list) -> dict | None:
    """Return the first manifest output rule that ``file_path`` satisfies.

    Matching is done purely with local string/glob comparisons; no Gogs
    API call is made here.

    Rules are tried in order:

    * Directory rule (``is_directory_pattern`` is true for the raw
      ``path``): the file must sit under the normalized rule path with
      trailing slashes stripped. When that path contains ``*`` the test is
      ``fnmatch`` against ``<dir>/*`` or ``<dir>``, otherwise a plain
      ``<dir>/`` prefix test. The file's basename must then pass
      ``self._match_patterns`` with the rule's ``pattern`` (default
      ``"*"``) and ``exclude`` values.
    * File rule: ``file_path`` must equal the normalized rule path exactly.

    Parameters
    ----------
    file_path : str
        Path of the changed file to test.
    outputs : list
        Output rule dicts taken from the manifest.

    Returns
    -------
    dict | None
        The matching rule dict, or ``None`` when no rule matches.
    """
    for rule in outputs:
        declared_path = rule.get("path", "")
        normalized = normalize_path(declared_path)

        # File rules need nothing more than an exact path comparison.
        if not is_directory_pattern(declared_path):
            if file_path == normalized:
                return rule
            continue

        name_patterns = rule.get("pattern", "*")
        name_excludes = rule.get("exclude")

        folder = normalized.rstrip("/")
        if '*' in folder:
            # e.g. folder "data/*/test" accepts "data/x/test/<anything>"
            inside = (
                fnmatch.fnmatch(file_path, folder + "/*")
                or fnmatch.fnmatch(file_path, folder)
            )
        else:
            inside = file_path.startswith(folder + "/")

        if not inside:
            continue

        # Filename filters are applied to the basename only.
        if self._match_patterns(os.path.basename(file_path), name_patterns, name_excludes):
            return rule

    return None
|
|
|
|
|
|
- for file_info in files:
|
|
|
- file_path = file_info.get("path")
|
|
|
-
|
|
|
- # 1. First verify if the full path matches the wildcard directory path provided
|
|
|
- if '*' in dir_path:
|
|
|
- # e.g dir_path: data/*/test/ -> match: data/*/test/*
|
|
|
- if not fnmatch.fnmatch(file_path, dir_path + "/*") and not fnmatch.fnmatch(file_path, dir_path):
|
|
|
- continue
|
|
|
- else:
|
|
|
- if not file_path.startswith(dir_path + "/"):
|
|
|
- continue
|
|
|
-
|
|
|
- # Calculate name relative to the matched base path segment for pattern matching
|
|
|
- import os
|
|
|
- rel_name = os.path.basename(file_path)
|
|
|
-
|
|
|
- if self._match_patterns(rel_name, patterns, excludes):
|
|
|
- try:
|
|
|
- changed = await self.storage.process_file_with_sha(
|
|
|
- version, file_path, file_info.get("sha"), owner, repo_name,
|
|
|
- direction=direction, label=label, extract_json_key=extract_json_key,
|
|
|
- directory_depth=directory_depth
|
|
|
- )
|
|
|
- if changed:
|
|
|
- has_changes = True
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"Failed to process file {file_path}: {e}")
|
|
|
async def _fetch_and_process_file(
    self, version, file_path: str, output_config: dict,
    owner: str, repo_name: str, commit_id: str
) -> bool:
    """Snapshot one changed output file and, if configured, its paired input.

    The file's info is looked up via ``self.gogs.get_file_info`` at
    ``commit_id``; a falsy result is treated as "file removed" and the file
    is skipped. Otherwise the file is handed to
    ``self.storage.process_file_with_sha`` together with the rule's
    ``direction``, ``label``, ``extract_json_key`` and ``directory_depth``
    and a ``group_key`` computed here, so that a paired input can share it.

    A failure while handling the paired input is logged and does NOT
    discard the result already obtained for the primary file.

    Parameters
    ----------
    version
        Version object forwarded unchanged to the storage layer.
    file_path : str
        Repository path of the changed file.
    output_config : dict
        The manifest output rule that matched ``file_path``.
    owner, repo_name, commit_id : str
        Repository coordinates used for the Gogs lookups.

    Returns
    -------
    bool
        ``True`` if the primary file or its paired input reported a change.
    """
    file_info = await self.gogs.get_file_info(owner, repo_name, commit_id, file_path)
    if not file_info:
        logger.info(f"File {file_path} not found at commit {commit_id[:8]} (removed). Skipping.")
        return False

    # group_key: the first `directory_depth` parent directories when a
    # positive depth is configured, otherwise the full parent directory.
    directory_depth = output_config.get("directory_depth")
    if directory_depth is not None and directory_depth > 0:
        parent_parts = file_path.split("/")[:-1]  # empty for a root-level file
        group_key = "/".join(parent_parts[:directory_depth])
    else:
        group_key = os.path.dirname(file_path)

    has_change = await self.storage.process_file_with_sha(
        version, file_path, file_info.get("sha"), owner, repo_name,
        direction=output_config.get("direction"),
        label=output_config.get("label"),
        extract_json_key=output_config.get("extract_json_key"),
        directory_depth=directory_depth,
        group_key=group_key,
    )

    paired_input = output_config.get("paired_input")
    if paired_input:
        try:
            paired_changed = await self._process_paired_input(
                version, file_path, paired_input,
                owner, repo_name, commit_id, group_key,
            )
        except Exception:
            # A bad regex/template or a failed fetch of the paired file must
            # not erase the change already recorded for the primary file.
            logger.exception(f"Failed to process paired input for {file_path}")
            paired_changed = False
        has_change = has_change or paired_changed

    # Coerce so the annotated bool contract holds for callers testing identity.
    return bool(has_change)

async def _process_paired_input(
    self, version, file_path: str, paired_input: dict,
    owner: str, repo_name: str, commit_id: str, group_key: str
) -> bool:
    """Derive, fetch and snapshot the input file paired with ``file_path``.

    The paired path is built by applying ``paired_input["extract_regex"]``
    to ``file_path`` and feeding the named capture groups into
    ``paired_input["path_template"]``. Returns ``False`` when the pairing
    is not fully configured, the regex does not match, the template needs a
    group the regex did not define, or the paired file is not found at
    ``commit_id``. Other errors propagate to the caller.
    """
    extract_regex = paired_input.get("extract_regex")
    path_template = paired_input.get("path_template")
    if not (extract_regex and path_template):
        return False

    match = re.search(extract_regex, file_path)
    if not match:
        return False

    # Construct paired file path using named capture groups
    try:
        paired_path = path_template.format(**match.groupdict())
    except KeyError as e:
        logger.error(f"Failed to format paired_path: missing {e} in regex match for {file_path}")
        return False
    if not paired_path:
        return False

    # Actively fetch paired file info from Gogs
    paired_info = await self.gogs.get_file_info(owner, repo_name, commit_id, paired_path)
    if not paired_info:
        logger.warning(f"Paired input file not found at commit {commit_id[:8]}: {paired_path}")
        return False

    return await self.storage.process_file_with_sha(
        version, paired_path, paired_info.get("sha"), owner, repo_name,
        direction=paired_input.get("direction", "input"),
        label=paired_input.get("label"),
        extract_json_key=paired_input.get("extract_json_key"),
        group_key=group_key,  # same key as the output so the two are linked
    )
|
|
|
+
|
|
|
async def _process_outputs(
    self, version, outputs: list, owner: str, repo_name: str, commit_id: str,
    changed_files: set[str]
) -> bool:
    """Process only the changed files that match a manifest output rule.

    Each path in ``changed_files`` is matched against ``outputs`` with
    ``self._find_matching_output`` (local matching, no API calls). Every
    matched file is then handled concurrently by
    ``self._fetch_and_process_file``. A failure for one file is logged and
    does not stop the others.

    Parameters
    ----------
    version
        Version object forwarded to the per-file processing.
    outputs : list
        Output rule dicts from the manifest.
    owner, repo_name, commit_id : str
        Repository coordinates forwarded to the per-file processing.
    changed_files : set[str]
        Paths reported as changed.

    Returns
    -------
    bool
        ``True`` if at least one file reported a change, ``False`` if
        nothing matched, nothing changed, or every file failed.
    """
    # Step 1: local matching, zero API calls
    matched_files = [
        (file_path, output_config)
        for file_path in changed_files
        if (output_config := self._find_matching_output(file_path, outputs)) is not None
    ]

    if not matched_files:
        logger.info("No changed files matched any output rule.")
        return False

    logger.info(f"Matched {len(matched_files)} changed file(s) to output rules. Processing in parallel.")

    # Step 2: fetch file info + download/upload in parallel
    results = await asyncio.gather(
        *(
            self._fetch_and_process_file(version, file_path, output_config, owner, repo_name, commit_id)
            for file_path, output_config in matched_files
        ),
        return_exceptions=True,
    )

    has_changes = False
    for (file_path, _), result in zip(matched_files, results):
        # gather() may hand back BaseException instances too (for example
        # CancelledError), so test for the base class; otherwise such a
        # result would be silently ignored.
        if isinstance(result, BaseException):
            logger.error(f"Error processing {file_path}: {result}", exc_info=result)
        elif result:
            # Truthiness rather than `is True`, so any truthy "changed"
            # value counts.
            has_changes = True

    return has_changes
|
|
|
|