@@ -7,6 +7,7 @@
 from agents import Agent, Runner, ModelSettings
 from lib.client import get_model
 from lib.utils import parse_json_from_text
+from lib.config import get_cache_dir
 from typing import Dict, Any, Optional
 import hashlib
 import json
@@ -26,8 +27,10 @@ DEFAULT_PROMPT_TEMPLATE = """
 ```
 """.strip()
 
-# Default cache directory
-DEFAULT_CACHE_DIR = "cache/semantic_similarity"
+
+def _get_default_cache_dir() -> str:
+    """Get the default cache directory (read from config)."""
+    return get_cache_dir("semantic_similarity")
 
 
 def _generate_cache_key(
@@ -91,7 +94,7 @@ def _get_cache_filepath(
     phrase_b: str,
     model_name: str,
     temperature: float,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> Path:
     """
     Get the cache file path (a human-readable filename)
@@ -110,6 +113,9 @@ def _get_cache_filepath(
     Filename format: {phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json
     Example: 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
     """
+    if cache_dir is None:
+        cache_dir = _get_default_cache_dir()
+
     # Sanitize the phrases and the model name
     clean_a = _sanitize_for_filename(phrase_a, max_length=20)
     clean_b = _sanitize_for_filename(phrase_b, max_length=20)
@@ -136,7 +142,7 @@ def _load_from_cache(
     phrase_b: str,
     model_name: str,
     temperature: float,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> Optional[str]:
     """
     Load data from the cache
@@ -152,6 +158,9 @@ def _load_from_cache(
     Returns:
         The cached result string, or None if no cache entry exists
     """
+    if cache_dir is None:
+        cache_dir = _get_default_cache_dir()
+
     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
 
     # If the file does not exist, try to find it by matching the hash
@@ -187,7 +196,7 @@ def _save_to_cache(
     instructions: str,
     tools: str,
     result: str,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> None:
     """
     Save data to the cache
@@ -205,6 +214,9 @@ def _save_to_cache(
         result: the result data (raw string)
         cache_dir: cache directory
     """
+    if cache_dir is None:
+        cache_dir = _get_default_cache_dir()
+
     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
 
     # Make sure the cache directory exists
@@ -254,7 +266,7 @@ async def _difference_between_phrases(
     tools: list = None,
     name: str = "Semantic Similarity Analyzer",
     use_cache: bool = True,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> str:
     """
     Judge the semantic similarity of two phrases
@@ -277,7 +289,7 @@ async def _difference_between_phrases(
         tools: list of tools available to the Agent, defaults to []
         name: name of the Agent, defaults to "Semantic Similarity Analyzer" (not part of the cache key)
         use_cache: whether to use the cache, defaults to True
-        cache_dir: cache directory, defaults to 'cache/semantic_similarity'
+        cache_dir: cache directory, read from config by default (set via lib.config)
 
     Returns:
         The similarity analysis result as a JSON-formatted string
@@ -345,13 +357,8 @@ async def _difference_between_phrases(
     result = await Runner.run(agent, input=prompt)
     final_output = result.final_output
 
-    # Save to the cache
-    if use_cache:
-        _save_to_cache(
-            cache_key, phrase_a, phrase_b, model_name,
-            temperature, max_tokens, prompt_template,
-            instructions, tools_str, final_output, cache_dir
-        )
+    # Note: do not cache here; cache only after parsing succeeds,
+    # which avoids caching responses that fail to parse
 
     return final_output
 
@@ -367,7 +374,7 @@ async def _difference_between_phrases_parsed(
     tools: list = None,
     name: str = "Semantic Similarity Analyzer",
     use_cache: bool = True,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> Dict[str, Any]:
     """
     Judge the semantic similarity of two phrases and parse the result into a dict
@@ -383,13 +390,16 @@ async def _difference_between_phrases_parsed(
         tools: list of tools available to the Agent, defaults to []
         name: name of the Agent, defaults to "Semantic Similarity Analyzer"
         use_cache: whether to use the cache, defaults to True
-        cache_dir: cache directory, defaults to 'cache/semantic_similarity'
+        cache_dir: cache directory, read from config by default (set via lib.config)
 
     Returns:
         The parsed dict, containing:
         - 说明: the reasoning behind the similarity judgment
         - 相似度: a float between 0 and 1
 
+    Raises:
+        ValueError: raised when the AI response cannot be parsed as valid JSON
+
     Examples:
         >>> result = await difference_between_phrases_parsed("宿命感", "余华的小说")
         >>> print(result['相似度'])
@@ -397,21 +407,68 @@ async def _difference_between_phrases_parsed(
         >>> print(result['说明'])
         "两个概念有一定关联..."
     """
+    # Use the default template or a custom one
+    if prompt_template is None:
+        prompt_template = DEFAULT_PROMPT_TEMPLATE
+
+    # Default tools to an empty list
+    if tools is None:
+        tools = []
+
+    # Generate the cache key
+    tools_str = json.dumps(tools, sort_keys=True) if tools else "[]"
+    cache_key = _generate_cache_key(
+        phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
+    )
+
+    # Try to load from the cache
+    if use_cache:
+        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
+        if cached_result is not None:
+            # Cache hit: parse and return directly
+            parsed_result = parse_json_from_text(cached_result)
+            if parsed_result:
+                return parsed_result
+            # If the cached content cannot be parsed either, fall through to the API call (a bad response may have been cached earlier)
+
+    # Call the AI for the raw response (use_cache=False, since caching is handled manually here)
     raw_result = await _difference_between_phrases(
         phrase_a, phrase_b, model_name, temperature, max_tokens,
-        prompt_template, instructions, tools, name, use_cache, cache_dir
+        prompt_template, instructions, tools, name, use_cache=False, cache_dir=cache_dir
     )
 
     # Parse the result with utils.parse_json_from_text
     parsed_result = parse_json_from_text(raw_result)
 
-    # If parsing fails (an empty dict is returned), return a result carrying the error info
+    # If parsing fails (an empty dict is returned), raise an exception with the details
     if not parsed_result:
-        return {
-            "说明": "Parse failure: could not extract valid JSON from the response",
-            "相似度": 0.0,
-            "raw_response": raw_result
-        }
+        # Format the prompt for the error message
+        formatted_prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)
+
+        error_msg = f"""
+JSON parsing failed!
+================================================================================
+Phrase A: {phrase_a}
+Phrase B: {phrase_b}
+Model: {model_name}
+Temperature: {temperature}
+================================================================================
+Prompt:
+{formatted_prompt}
+================================================================================
+AI response (length: {len(raw_result)}):
+{raw_result}
+================================================================================
+"""
+        raise ValueError(error_msg)
+
+    # Cache only after parsing succeeds
+    if use_cache:
+        _save_to_cache(
+            cache_key, phrase_a, phrase_b, model_name,
+            temperature, max_tokens, prompt_template,
+            instructions, tools_str, raw_result, cache_dir
+        )
 
     return parsed_result
 
@@ -430,7 +487,7 @@ async def compare_phrases(
     tools: list = None,
     name: str = "Semantic Similarity Analyzer",
     use_cache: bool = True,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> Dict[str, Any]:
     """
     Compare the semantic similarity of two phrases (the sole public interface)
@@ -446,7 +503,7 @@ async def compare_phrases(
         tools: list of tools available to the Agent, defaults to []
         name: name of the Agent, defaults to "Semantic Similarity Analyzer"
         use_cache: whether to use the cache, defaults to True
-        cache_dir: cache directory, defaults to 'cache/semantic_similarity'
+        cache_dir: cache directory, read from config by default (set via lib.config)
 
     Returns:
         The parsed dict
@@ -540,7 +597,7 @@ async def compare_phrases_v2(
     tools: list = None,
     name: str = "Advanced Semantic Analyzer",
     use_cache: bool = True,
-    cache_dir: str = DEFAULT_CACHE_DIR
+    cache_dir: Optional[str] = None
 ) -> Dict[str, Any]:
     """
     Compare the semantic similarity of two phrases - V2 (detailed analysis)
@@ -561,7 +618,7 @@ async def compare_phrases_v2(
         tools: list of tools available to the Agent, defaults to []
         name: name of the Agent, defaults to "Advanced Semantic Analyzer"
         use_cache: whether to use the cache, defaults to True
-        cache_dir: cache directory, defaults to 'cache/semantic_similarity'
+        cache_dir: cache directory, read from config by default (set via lib.config)
 
     Returns:
         The parsed dict, containing:
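Caller-side usage sketch (not part of the patch): with this change, cache_dir is optional and falls back to lib.config's get_cache_dir("semantic_similarity"), and an unparseable model response raises ValueError instead of returning a placeholder dict, so such responses are never cached. The snippet assumes compare_phrases can be called with just the two phrases (as in the docstring example) and that it propagates the ValueError from the parsing helper; the import path and the asyncio wrapper are illustrative, not taken from this diff.

import asyncio

from semantic_similarity import compare_phrases  # assumed module path, adjust to the real package layout


async def main() -> None:
    try:
        # cache_dir is omitted, so the default comes from lib.config's get_cache_dir("semantic_similarity")
        result = await compare_phrases("宿命感", "余华的小说")
        print(result["相似度"], result["说明"])
    except ValueError as exc:
        # Raised when the model response cannot be parsed as valid JSON; nothing is cached in that case
        print(f"similarity analysis failed: {exc}")


asyncio.run(main())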