path_config.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 路径配置管理工具
  5. 提供统一的路径管理,支持多账号批量处理
  6. """
  7. import json
  8. from pathlib import Path
  9. from typing import Dict, Optional, List
  10. import os
  11. class PathConfig:
  12. """路径配置管理类"""
  13. def __init__(self, account_name: Optional[str] = None, output_version: Optional[str] = None):
  14. """
  15. 初始化路径配置
  16. Args:
  17. account_name: 账号名称,如果不指定则使用默认账号或环境变量
  18. output_version: 输出版本,如果不指定则使用项目根目录名称
  19. """
  20. # 获取项目根目录
  21. self.project_root = Path(__file__).parent.parent.parent
  22. self.config_file = self.project_root / "config" / "accounts.json"
  23. # 加载配置
  24. self._load_config()
  25. # 获取数据根目录
  26. self.data_root = self._get_data_root()
  27. # 确定账号名称
  28. self.account_name = self._determine_account_name(account_name)
  29. # 确定输出版本(默认使用项目根目录名)
  30. self.output_version = self._determine_output_version(output_version)
  31. # 构建路径
  32. account_base = self.config["paths"]["account_base"]
  33. self.account_dir = self.data_root / account_base / self.account_name
  34. def _load_config(self):
  35. """加载配置文件"""
  36. if not self.config_file.exists():
  37. raise FileNotFoundError(f"配置文件不存在: {self.config_file}")
  38. with open(self.config_file, "r", encoding="utf-8") as f:
  39. self.config = json.load(f)
  40. def _get_account_config(self, account_name: str) -> Optional[Dict]:
  41. """获取特定账号的配置"""
  42. accounts = self.config.get("accounts", [])
  43. for acc in accounts:
  44. if acc["name"] == account_name:
  45. return acc
  46. return None
  47. def _get_data_root(self) -> Path:
  48. """
  49. 获取数据根目录
  50. 优先级:
  51. 1. 环境变量 DATA_ROOT
  52. 2. 配置文件 data_root
  53. 3. 默认值 project_root/data(向后兼容)
  54. """
  55. # 1. 环境变量
  56. data_root = os.environ.get("DATA_ROOT")
  57. if data_root:
  58. return Path(os.path.expanduser(data_root))
  59. # 2. 配置文件
  60. data_root_config = self.config.get("data_root")
  61. if data_root_config:
  62. # 支持 ~ 和环境变量
  63. expanded = os.path.expandvars(os.path.expanduser(data_root_config))
  64. path = Path(expanded)
  65. if path.is_absolute():
  66. return path
  67. else:
  68. return self.project_root / path
  69. # 3. 默认值(向后兼容)
  70. return self.project_root / "data"
  71. def _determine_account_name(self, account_name: Optional[str]) -> str:
  72. """
  73. 确定要使用的账号名称
  74. 优先级:
  75. 1. 函数参数指定的账号名
  76. 2. 环境变量 ACCOUNT_NAME
  77. 3. 配置文件中的默认账号
  78. Args:
  79. account_name: 参数指定的账号名
  80. Returns:
  81. 最终确定的账号名称
  82. """
  83. # 1. 参数指定
  84. if account_name:
  85. return account_name
  86. # 2. 环境变量
  87. env_account = os.environ.get("ACCOUNT_NAME")
  88. if env_account:
  89. return env_account
  90. # 3. 配置文件默认值
  91. default_account = self.config.get("default_account")
  92. if default_account:
  93. return default_account
  94. # 4. 如果都没有,抛出错误
  95. raise ValueError(
  96. "未指定账号名称!请通过以下方式之一指定:\n"
  97. "1. 参数: PathConfig(account_name='账号名')\n"
  98. "2. 环境变量: export ACCOUNT_NAME='账号名'\n"
  99. "3. 配置文件: 在 config/accounts.json 中设置 default_account"
  100. )
  101. def _determine_output_version(self, output_version: Optional[str]) -> str:
  102. """
  103. 确定输出版本
  104. 优先级:
  105. 1. 函数参数
  106. 2. 环境变量 OUTPUT_VERSION
  107. 3. 配置文件中的 output_version
  108. 4. 项目根目录名称(默认)
  109. """
  110. # 1. 参数指定
  111. if output_version:
  112. return output_version
  113. # 2. 环境变量
  114. env_version = os.environ.get("OUTPUT_VERSION")
  115. if env_version:
  116. return env_version
  117. # 3. 配置文件指定
  118. config_version = self.config.get("output_version")
  119. if config_version:
  120. return config_version
  121. # 4. 使用项目根目录名称(默认)
  122. project_dir_name = self.project_root.name
  123. return project_dir_name
  124. def _replace_version_var(self, path_template: str) -> str:
  125. """替换路径模板中的 {version} 变量"""
  126. return path_template.replace("{version}", self.output_version)
  127. def get_enabled_accounts(self) -> List[str]:
  128. """获取所有启用的账号列表"""
  129. accounts = self.config.get("accounts", [])
  130. return [acc["name"] for acc in accounts if acc.get("enabled", True)]
  131. def get_all_accounts(self) -> List[str]:
  132. """获取所有账号列表(包括未启用的)"""
  133. accounts = self.config.get("accounts", [])
  134. return [acc["name"] for acc in accounts]
  135. @property
  136. def filter_mode(self) -> str:
  137. """
  138. 获取过滤模式
  139. Returns:
  140. 过滤模式名称:
  141. - "exclude_current_posts": 过滤当前帖子ID(默认,推荐)
  142. - "time_based": 基于时间过滤
  143. - "none": 不过滤
  144. """
  145. return self.config.get("filter_mode", "exclude_current_posts")
  146. # ===== 输入路径 =====
  147. def _get_input_path(self, path_key: str) -> str:
  148. """
  149. 获取输入路径配置,支持账号级别的自定义路径
  150. 优先级:
  151. 1. 账号特定配置 (accounts[x].paths.input.path_key)
  152. 2. 全局默认配置 (paths.input.path_key)
  153. """
  154. # 1. 检查账号特定配置
  155. account_config = self._get_account_config(self.account_name)
  156. if account_config and "paths" in account_config:
  157. account_paths = account_config["paths"]
  158. if "input" in account_paths and path_key in account_paths["input"]:
  159. return account_paths["input"][path_key]
  160. # 2. 使用全局默认配置
  161. return self.config["paths"]["input"][path_key]
  162. @property
  163. def current_posts_dir(self) -> Path:
  164. """当前帖子what解构结果目录"""
  165. rel_path = self._get_input_path("current_posts")
  166. return self.account_dir / rel_path
  167. @property
  168. def historical_posts_dir(self) -> Path:
  169. """过去帖子what解构结果目录"""
  170. rel_path = self._get_input_path("historical_posts")
  171. return self.account_dir / rel_path
  172. @property
  173. def pattern_cluster_file(self) -> Path:
  174. """pattern聚合结果文件"""
  175. rel_path = self._get_input_path("pattern_cluster")
  176. return self.account_dir / rel_path
  177. # ===== 输出路径 =====
  178. @property
  179. def intermediate_dir(self) -> Path:
  180. """中间结果目录"""
  181. rel_path = self.config["paths"]["output"]["intermediate"]
  182. rel_path = self._replace_version_var(rel_path)
  183. return self.account_dir / rel_path
  184. @property
  185. def feature_category_mapping_file(self) -> Path:
  186. """特征名称_分类映射.json"""
  187. return self.intermediate_dir / "特征名称_分类映射.json"
  188. @property
  189. def category_hierarchy_file(self) -> Path:
  190. """分类层级映射.json"""
  191. return self.intermediate_dir / "分类层级映射.json"
  192. @property
  193. def feature_source_mapping_file(self) -> Path:
  194. """特征名称_帖子来源.json"""
  195. return self.intermediate_dir / "特征名称_帖子来源.json"
  196. @property
  197. def task_list_file(self) -> Path:
  198. """当前帖子_解构任务列表.json"""
  199. return self.intermediate_dir / "当前帖子_解构任务列表.json"
  200. @property
  201. def how_results_dir(self) -> Path:
  202. """how解构结果目录"""
  203. rel_path = self.config["paths"]["output"]["how_results"]
  204. rel_path = self._replace_version_var(rel_path)
  205. return self.account_dir / rel_path
  206. @property
  207. def visualization_dir(self) -> Path:
  208. """可视化结果目录"""
  209. rel_path = self.config["paths"]["output"]["visualization"]
  210. rel_path = self._replace_version_var(rel_path)
  211. return self.account_dir / rel_path
  212. @property
  213. def visualization_file(self) -> Path:
  214. """可视化HTML文件"""
  215. return self.visualization_dir / "how解构结果_可视化.html"
  216. # ===== 工具方法 =====
  217. def ensure_dirs(self):
  218. """确保所有输出目录存在"""
  219. self.intermediate_dir.mkdir(parents=True, exist_ok=True)
  220. self.how_results_dir.mkdir(parents=True, exist_ok=True)
  221. self.visualization_dir.mkdir(parents=True, exist_ok=True)
  222. def validate_input_paths(self) -> Dict[str, bool]:
  223. """
  224. 验证输入路径是否存在
  225. Returns:
  226. 验证结果字典
  227. """
  228. results = {
  229. "当前帖子目录": self.current_posts_dir.exists(),
  230. "过去帖子目录": self.historical_posts_dir.exists(),
  231. "pattern聚合文件": self.pattern_cluster_file.exists(),
  232. }
  233. return results
  234. def print_paths(self):
  235. """打印所有路径信息(用于调试)"""
  236. print("="*60)
  237. print(f"项目根目录: {self.project_root}")
  238. print(f"项目名称: {self.project_root.name}")
  239. print(f"数据根目录: {self.data_root}")
  240. print(f"输出版本: {self.output_version}")
  241. print(f"账号: {self.account_name}")
  242. print(f"过滤模式: {self.filter_mode}")
  243. print(f"账号根目录: {self.account_dir}")
  244. print("\n输入路径:")
  245. print(f" 当前帖子目录: {self.current_posts_dir}")
  246. print(f" 过去帖子目录: {self.historical_posts_dir}")
  247. print(f" pattern聚合文件: {self.pattern_cluster_file}")
  248. print("\n输出路径:")
  249. print(f" 中间结果目录: {self.intermediate_dir}")
  250. print(f" how解构结果目录: {self.how_results_dir}")
  251. print(f" 可视化结果目录: {self.visualization_dir}")
  252. print("="*60)
  253. def check_and_print_status(self):
  254. """检查并打印路径状态"""
  255. self.print_paths()
  256. print("\n输入路径验证:")
  257. validation = self.validate_input_paths()
  258. for name, exists in validation.items():
  259. status = "✓ 存在" if exists else "✗ 不存在"
  260. print(f" {name}: {status}")
  261. if not all(validation.values()):
  262. print("\n⚠️ 警告: 部分输入路径不存在!")
  263. return False
  264. else:
  265. print("\n✓ 所有输入路径验证通过")
  266. return True
  267. def get_path_config(account_name: Optional[str] = None) -> PathConfig:
  268. """
  269. 获取路径配置对象(便捷函数)
  270. Args:
  271. account_name: 账号名称,可选
  272. Returns:
  273. PathConfig对象
  274. """
  275. return PathConfig(account_name)
  276. if __name__ == "__main__":
  277. # 测试代码
  278. import sys
  279. account = sys.argv[1] if len(sys.argv) > 1 else None
  280. try:
  281. config = PathConfig(account)
  282. config.check_and_print_status()
  283. print("\n所有启用的账号:")
  284. for acc in config.get_enabled_accounts():
  285. print(f" - {acc}")
  286. except Exception as e:
  287. print(f"错误: {e}")
  288. sys.exit(1)