vor 1 Woche · c4c633e197
--- a/CACHE_CONFIG.md
+++ b/CACHE_CONFIG.md
@@ -0,0 +1,354 @@
 
				+# 缓存路径配置说明
			
 
				+
			
 
				+## 概述
			
 
				+
			
 
				+本项目已实现统一的缓存路径管理，所有缓存数据默认存储在 `~/cache/` 目录下，通过 `lib/config.py` 模块进行配置。
			
 
				+
			
 
				+## 目录结构
			
 
				+
			
 
				+```
			
 
				+~/cache/                        # 缓存根目录（默认：~/cache，可配置）
			
 
				+├── text_embedding/            # 向量相似度计算缓存
			
 
				+├── semantic_similarity/       # 语义相似度计算缓存
			
 
				+└── data/                      # 数据缓存（爬虫、分析等）
			
 
				+    ├── search/                # 搜索结果缓存
			
 
				+    ├── detail/                # 详情数据缓存
			
 
				+    └── tools_list/            # 工具列表缓存
			
 
				+
			
 
				+data/                          # 非缓存数据（项目数据、配置等）
			
 
				+├── 阿里多多酱/                # 账号相关数据
			
 
				+├── data_1117/                # 特定日期数据
			
 
				+└── ...                       # 其他非缓存文件
			
 
				+```
			
 
				+
			
 
				+## 使用方法
			
 
				+
			
 
				+### 1. 使用默认配置（推荐）
			
 
				+
			
 
				+默认情况下，所有缓存文件存储在用户主目录的 `~/cache/` 目录下，**无需任何配置**：
			
 
				+
			
 
				+```python
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+
			
 
				+# 计算缓存：~/cache/text_embedding/
			
 
				+result = compare_phrases("深度学习", "神经网络")
			
 
				+```
			
 
				+
			
 
				+```bash
			
 
				+# 数据缓存：~/cache/data/search/
			
 
				+python script/search/ai_search.py --query "深度学习"
			
 
				+```
			
 
				+
			
 
				+### 2. 通过代码设置缓存根目录
			
 
				+
			
 
				+在程序开始时，可以通过代码设置全局的缓存根目录，所有缓存（包括计算缓存和数据缓存）都会使用新路径：
			
 
				+
			
 
				+```python
			
 
				+from lib.config import set_cache_root
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+
			
 
				+# 设置缓存根目录
			
 
				+set_cache_root("/custom/cache")
			
 
				+
			
 
				+# 计算缓存：/custom/cache/text_embedding/
			
 
				+result = compare_phrases("深度学习", "神经网络")
			
 
				+
			
 
				+# 数据缓存：/custom/cache/data/search/
			
 
				+# 运行爬虫脚本时也会使用新路径
			
 
				+```
			
 
				+
			
 
				+### 3. 通过环境变量设置缓存根目录
			
 
				+
			
 
				+可以在运行程序前设置环境变量，所有缓存都会使用新路径：
			
 
				+
			
 
				+```bash
			
 
				+# Linux/Mac
			
 
				+export CACHE_ROOT=/custom/cache
			
 
				+python your_script.py
			
 
				+# 计算缓存 -> /custom/cache/text_embedding/
			
 
				+# 数据缓存 -> /custom/cache/data/search/
			
 
				+
			
 
				+# Windows
			
 
				+set CACHE_ROOT=C:\custom\cache
			
 
				+python your_script.py
			
 
				+```
			
 
				+
			
 
				+### 4. 为单次调用指定缓存目录
			
 
				+
			
 
				+如果只想为特定调用指定缓存目录：
			
 
				+
			
 
				+**计算缓存：**
			
 
				+```python
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+
			
 
				+# 为这次调用指定特殊的缓存目录
			
 
				+result = compare_phrases(
			
 
				+    "深度学习",
			
 
				+    "神经网络",
			
 
				+    cache_dir="/tmp/my_custom_cache"
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+**数据缓存：**
			
 
				+```bash
			
 
				+# 通过命令行参数指定
			
 
				+python script/search/ai_search.py --query "test" --results-dir /custom/output
			
 
				+```
			
 
				+
			
 
				+## 配置优先级
			
 
				+
			
 
				+### 计算缓存优先级
			
 
				+
			
 
				+1. **函数参数 `cache_dir`** - 优先级最高
			
 
				+2. **代码中调用 `set_cache_root()`** - 中等优先级
			
 
				+3. **环境变量 `CACHE_ROOT`** - 较低优先级
			
 
				+4. **默认值 `~/cache`** - 优先级最低
			
 
				+
			
 
				+### 数据缓存优先级
			
 
				+
			
 
				+1. **命令行参数 `--results-dir`** - 优先级最高
			
 
				+2. **代码中调用 `set_cache_root()`** - 中等优先级（影响 ~/cache/data/）
			
 
				+3. **环境变量 `CACHE_ROOT`** - 较低优先级（影响 ~/cache/data/）
			
 
				+4. **默认值 `~/cache/data/`** - 优先级最低
			
 
				+
			
 
				+## 涉及的模块
			
 
				+
			
 
				+### 计算缓存（cache/）
			
 
				+
			
 
				+- **lib/text_embedding.py** - 向量相似度缓存（`cache/text_embedding/`）
			
 
				+- **lib/semantic_similarity.py** - 语义相似度缓存（`cache/semantic_similarity/`）
			
 
				+- **lib/hybrid_similarity.py** - 混合相似度缓存
			
 
				+- **script/analysis/analyze_model_comparison.py** - 模型对比分析
			
 
				+- **script/analysis/test_all_models.py** - 模型测试
			
 
				+
			
 
				+### 数据缓存（cache/data/）
			
 
				+
			
 
				+- **script/search/** - 搜索结果缓存（`cache/data/search/`）
			
 
				+  - ai_search.py, custom_search.py, douyin_search.py, xiaohongshu_search.py
			
 
				+- **script/detail/** - 详情数据缓存（`cache/data/detail/`）
			
 
				+  - xiaohongshu_detail.py
			
 
				+- **script/get_tools_list.py** - 工具列表缓存（`cache/data/tools_list/`）
			
 
				+- **script/search_recommendations/** - 搜索推荐缓存（`cache/data/search_recommendations/`）
			
 
				+- **script/search_tagwords/** - 搜索标签词缓存（`cache/data/search_tagwords/`）
			
 
				+
			
 
				+### 非缓存数据（data/）
			
 
				+
			
 
				+- 账号相关数据（`data/阿里多多酱/`, `data/账号/`）
			
 
				+- 特定日期数据（`data/data_1117/`, `data/data_1118/`等）
			
 
				+- 分析脚本（`data/*.py`）
			
 
				+- 分析结果（`data/*.xlsx`, `data/*.json`）
			
 
				+- 文档（`data/*.md`）
			
 
				+
			
 
				+## 示例代码
			
 
				+
			
 
				+### 示例 1: 使用默认配置
			
 
				+
			
 
				+```python
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+
			
 
				+result = compare_phrases("如何更换花呗绑定银行卡", "花呗更改绑定银行卡")
			
 
				+print(f"相似度: {result['相似度']:.3f}")
			
 
				+# 缓存位置: ~/cache/text_embedding/
			
 
				+```
			
 
				+
			
 
				+### 示例 2: 设置全局缓存根目录
			
 
				+
			
 
				+```python
			
 
				+from lib.config import set_cache_root, get_cache_root
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
			
 
				+import asyncio
			
 
				+
			
 
				+# 设置全局缓存根目录
			
 
				+set_cache_root("/path/to/custom/cache")
			
 
				+
			
 
				+print(f"当前缓存根目录: {get_cache_root()}")
			
 
				+# 输出: /path/to/custom/cache
			
 
				+
			
 
				+# 所有模块都会使用新的缓存路径
			
 
				+result1 = compare_phrases("深度学习", "神经网络")
			
 
				+# 缓存位置: /path/to/custom/cache/text_embedding/
			
 
				+
			
 
				+result2 = asyncio.run(compare_phrases_semantic("深度学习", "神经网络"))
			
 
				+# 缓存位置: /path/to/custom/cache/semantic_similarity/
			
 
				+```
			
 
				+
			
 
				+### 示例 3: 使用环境变量
			
 
				+
			
 
				+```python
			
 
				+# 在运行脚本前设置环境变量
			
 
				+# export CACHE_ROOT=/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache
			
 
				+
			
 
				+from lib.config import get_cache_root
			
 
				+from lib.text_embedding import compare_phrases
			
 
				+
			
 
				+print(f"缓存根目录: {get_cache_root()}")
			
 
				+# 输出: /Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache
			
 
				+
			
 
				+result = compare_phrases("测试", "示例")
			
 
				+# 缓存位置: /Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache/text_embedding/
			
 
				+```
			
 
				+
			
 
				+### 示例 4: 混合相似度模块配置
			
 
				+
			
 
				+```python
			
 
				+from lib.hybrid_similarity import compare_phrases
			
 
				+from lib.config import set_cache_root
			
 
				+import asyncio
			
 
				+
			
 
				+# 方式1: 使用全局配置
			
 
				+set_cache_root("/custom/cache")
			
 
				+result = asyncio.run(compare_phrases("深度学习", "神经网络"))
			
 
				+# 向量模型缓存: /custom/cache/text_embedding/
			
 
				+# 语义模型缓存: /custom/cache/semantic_similarity/
			
 
				+
			
 
				+# 方式2: 分别指定缓存目录
			
 
				+result = asyncio.run(compare_phrases(
			
 
				+    "深度学习",
			
 
				+    "神经网络",
			
 
				+    cache_dir_embedding="/path/to/embedding/cache",
			
 
				+    cache_dir_semantic="/path/to/semantic/cache"
			
 
				+))
			
 
				+```
			
 
				+
			
 
				+### 示例 5: 在脚本中使用
			
 
				+
			
 
				+```python
			
 
				+# script/my_analysis.py
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径
			
 
				+project_root = Path(__file__).parent.parent
			
 
				+sys.path.insert(0, str(project_root))
			
 
				+
			
 
				+from lib.config import set_cache_root, get_cache_dir
			
 
				+
			
 
				+# 设置缓存根目录
			
 
				+set_cache_root("/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache")
			
 
				+
			
 
				+# 获取特定模块的缓存目录
			
 
				+text_embedding_cache = get_cache_dir("text_embedding")
			
 
				+semantic_similarity_cache = get_cache_dir("semantic_similarity")
			
 
				+
			
 
				+print(f"向量模型缓存: {text_embedding_cache}")
			
 
				+print(f"语义模型缓存: {semantic_similarity_cache}")
			
 
				+```
			
 
				+
			
 
				+## API 参考
			
 
				+
			
 
				+### lib.config 模块
			
 
				+
			
 
				+#### 缓存路径相关
			
 
				+
			
 
				+##### `get_cache_root() -> str`
			
 
				+获取当前的缓存根目录。
			
 
				+
			
 
				+##### `set_cache_root(path: str) -> None`
			
 
				+设置缓存根目录。
			
 
				+
			
 
				+**参数:**
			
 
				+- `path`: 缓存根目录路径（可以是绝对路径或相对路径）
			
 
				+
			
 
				+##### `get_cache_dir(subdir: str) -> str`
			
 
				+获取特定子模块的缓存目录。
			
 
				+
			
 
				+**参数:**
			
 
				+- `subdir`: 子目录名称，如 `"text_embedding"`, `"semantic_similarity"`
			
 
				+
			
 
				+**返回:**
			
 
				+- 完整的缓存目录路径
			
 
				+
			
 
				+#### 数据路径相关
			
 
				+
			
 
				+##### `get_data_root() -> str`
			
 
				+获取当前的数据根目录。**注意：** 当前版本的 `lib/config.py` 尚未提供该函数；数据根目录固定为 `<缓存根目录>/data`，请使用 `get_data_dir()`（不传参数）获取。
			
 
				+
			
 
				+##### `set_data_root(path: str) -> None`
			
 
				+设置数据根目录。**注意：** 当前版本的 `lib/config.py` 尚未提供该函数；数据根目录始终跟随缓存根目录，请通过 `set_cache_root()` 或环境变量 `CACHE_ROOT` 调整。
			
 
				+
			
 
				+**参数:**
			
 
				+- `path`: 数据根目录路径（可以是绝对路径或相对路径）
			
 
				+
			
 
				+##### `get_data_dir(subdir: str = "") -> str`
			
 
				+获取特定子模块的数据目录。
			
 
				+
			
 
				+**参数:**
			
 
				+- `subdir`: 子目录名称，如 `"search"`, `"detail"`, `"tools_list"` 等。如果为空字符串，返回数据根目录
			
 
				+
			
 
				+**返回:**
			
 
				+- 完整的数据目录路径
			
 
				+
			
 
				+## 注意事项
			
 
				+
			
 
				+1. **路径格式**: 支持绝对路径和相对路径，相对路径相对于当前工作目录
			
 
				+2. **自动创建**: 缓存目录会在首次写入时自动创建
			
 
				+3. **线程安全**: 配置模块是线程安全的，可以在多线程环境中使用
			
 
				+4. **环境变量优先级**: 如果同时设置了环境变量和代码配置，代码配置优先级更高
			
 
				+
			
 
				+## 迁移指南
			
 
				+
			
 
				+如果你之前使用的是硬编码的缓存路径（如 `/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache`），现在可以：
			
 
				+
			
 
				+### 方式 1: 设置环境变量（推荐）
			
 
				+
			
 
				+```bash
			
 
				+export CACHE_ROOT=/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache
			
 
				+```
			
 
				+
			
 
				+然后正常运行你的脚本，无需修改代码。
			
 
				+
			
 
				+### 方式 2: 在代码开头设置
			
 
				+
			
 
				+在你的脚本开头添加：
			
 
				+
			
 
				+```python
			
 
				+from lib.config import set_cache_root
			
 
				+
			
 
				+set_cache_root("/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache")
			
 
				+```
			
 
				+
			
 
				+### 方式 3: 使用相对路径
			
 
				+
			
 
				+如果你想让缓存路径相对于项目目录：
			
 
				+
			
 
				+```python
			
 
				+from lib.config import set_cache_root
			
 
				+from pathlib import Path
			
 
				+
			
 
				+project_root = Path(__file__).parent.parent
			
 
				+cache_path = project_root / "cache"
			
 
				+set_cache_root(str(cache_path))
			
 
				+```
			
 
				+
			
 
				+## 常见问题
			
 
				+
			
 
				+**Q: 我可以为不同的模块使用不同的缓存根目录吗？**
			
 
				+
			
 
				+A: 目前不支持。所有模块共享同一个缓存根目录，但你可以在调用时使用 `cache_dir` 参数为单次调用指定不同的路径。
			
 
				+
			
 
				+**Q: 修改缓存路径后，旧的缓存文件会自动迁移吗？**
			
 
				+
			
 
				+A: 不会。你需要手动移动缓存文件到新的目录，或者让程序重新生成缓存。
			
 
				+
			
 
				+**Q: 如何清空缓存？**
			
 
				+
			
 
				+A: 直接删除缓存目录下的内容即可（默认缓存根目录为 `~/cache`；如果自定义了缓存根目录，请替换为对应路径）：`rm -rf ~/cache/text_embedding/*` 或 `rm -rf ~/cache/semantic_similarity/*`
			
 
				+
			
 
				+**Q: 缓存文件占用空间过大怎么办？**
			
 
				+
			
 
				+A: 可以定期清理旧的缓存文件，或者设置缓存到临时目录（如 `/tmp/cache`）。
			
 
				+
			
 
				+**Q: cache/ 和 data/ 目录有什么区别？**
			
 
				+
			
 
				+A:
			
 
				+- **cache/**: 所有可以重新生成的缓存数据
			
 
				+  - `cache/text_embedding/` - 计算缓存
			
 
				+  - `cache/semantic_similarity/` - 计算缓存
			
 
				+  - `cache/data/` - 数据缓存（爬虫、工具列表等）
			
 
				+- **data/**: 不可重新生成的项目数据
			
 
				+  - 账号数据、特定日期的分析结果、文档等
			
 
				+
			
 
				+**Q: 为什么 cache/data/ 也叫缓存？**
			
 
				+
			
 
				+A: 因为爬虫采集的数据（search、detail、tools_list）都可以通过重新运行脚本获取，本质上是可重新生成的缓存数据。统一放在 cache/ 下便于管理和清理。
			
--- a/CACHE_LOCATION.md
+++ b/CACHE_LOCATION.md
@@ -0,0 +1,53 @@
 
				+# 缓存位置说明
			
 
				+
			
 
				+本项目的缓存默认存储在用户主目录：
			
 
				+
			
 
				+**默认缓存位置：** `~/cache` (即 `/Users/semsevens/cache`)
			
 
				+
			
 
				+## 目录结构
			
 
				+
			
 
				+```
			
 
				+~/cache/
			
 
				+├── text_embedding/            # 向量相似度计算缓存
			
 
				+├── semantic_similarity/       # 语义相似度计算缓存
			
 
				+└── data/                      # 数据缓存
			
 
				+    ├── search/                # 搜索结果缓存
			
 
				+    ├── detail/                # 详情数据缓存
			
 
				+    └── tools_list/            # 工具列表缓存
			
 
				+```
			
 
				+
			
 
				+## 配置方式
			
 
				+
			
 
				+### 1. 默认配置（推荐）
			
 
				+
			
 
				+无需任何配置，代码默认使用 `~/cache` 作为缓存根目录。
			
 
				+
			
 
				+### 2. 通过环境变量覆盖
			
 
				+
			
 
				+如果需要使用其他路径，可以设置环境变量：
			
 
				+
			
 
				+编辑 `~/.zshrc`，添加：
			
 
				+```bash
			
 
				+export CACHE_ROOT=/your/custom/path
			
 
				+```
			
 
				+
			
 
				+然后执行：
			
 
				+```bash
			
 
				+source ~/.zshrc
			
 
				+```
			
 
				+
			
 
				+### 3. 通过代码设置
			
 
				+
			
 
				+在程序开始时调用：
			
 
				+```python
			
 
				+from lib.config import set_cache_root
			
 
				+set_cache_root("/your/custom/path")
			
 
				+```
			
 
				+
			
 
				+## 配置优先级
			
 
				+
			
 
				+1. **代码中调用 `set_cache_root()`** - 最高优先级
			
 
				+2. **环境变量 `CACHE_ROOT`** - 中等优先级
			
 
				+3. **默认值 `~/cache`** - 最低优先级
			
 
				+
			
 
				+参考 `CACHE_CONFIG.md` 了解更多配置方式。
			
--- a/CACHE_MIGRATION_SUMMARY.md
+++ b/CACHE_MIGRATION_SUMMARY.md
@@ -0,0 +1,144 @@
 
				+# 缓存迁移总结
			
 
				+
			
 
				+## 问题发现
			
 
				+
			
 
				+在检查缓存时发现了一个孤立的缓存目录：
			
 
				+
			
 
				+**位置：** `/Users/semsevens/Desktop/workspace/daily/1113/how_1121_v2/script/data_processing/cache/`
			
 
				+
			
 
				+**大小：** 105M（包含 26,931 个文件）
			
 
				+
			
 
				+### 原因分析
			
 
				+
			
 
				+这个孤立缓存目录的产生原因：
			
 
				+
			
 
				+1. **相对路径问题**：早期版本的 `lib/text_embedding.py` 和 `lib/semantic_similarity.py` 使用相对路径 `cache/text_embedding/` 作为默认缓存目录
			
 
				+2. **工作目录依赖**：当从 `script/data_processing/` 目录运行脚本时，缓存会在当前工作目录下创建
			
 
				+3. **结果**：在 `script/data_processing/cache/` 下生成了大量缓存文件
			
 
				+
			
 
				+## 已执行的迁移操作
			
 
				+
			
 
				+### 1. 迁移孤立缓存到统一位置
			
 
				+
			
 
				+```bash
			
 
				+# 合并 text_embedding 缓存（26,931 个文件）
			
 
				+rsync -av script/data_processing/cache/text_embedding/ ~/cache/text_embedding/
			
 
				+
			
 
				+# 合并 semantic_similarity 缓存
			
 
				+cp -R script/data_processing/cache/semantic_similarity/* ~/cache/semantic_similarity/
			
 
				+
			
 
				+# 删除孤立缓存目录
			
 
				+rm -rf script/data_processing/cache/
			
 
				+```
			
 
				+
			
 
				+### 2. 验证无其他孤立缓存
			
 
				+
			
 
				+```bash
			
 
				+find /Users/semsevens/Desktop/workspace/daily/1113/how_1121_v2 -type d -name "cache"
			
 
				+# 结果：无输出，确认项目中已无其他孤立缓存目录
			
 
				+```
			
 
				+
			
 
				+## 最终缓存状态
			
 
				+
			
 
				+### 统一缓存位置：`~/cache`
			
 
				+
			
 
				+```
			
 
				+~/cache/
			
 
				+├── text_embedding/          # 105M (26,981 files) - 向量相似度缓存
			
 
				+├── semantic_similarity/     # 122M (31,307 files) - 语义相似度缓存
			
 
				+├── data/                    # 1.5M - 数据缓存
			
 
				+│   ├── search/              # 搜索结果缓存
			
 
				+│   ├── detail/              # 详情数据缓存
			
 
				+│   └── tools_list/          # 工具列表缓存
			
 
				+└── xhs_blogger/            # 516K - 博主数据缓存
			
 
				+```
			
 
				+
			
 
				+### 总缓存大小
			
 
				+
			
 
				+- **text_embedding**: 105M
			
 
				+- **semantic_similarity**: 122M
			
 
				+- **data**: 1.5M
			
 
				+- **xhs_blogger**: 516K
			
 
				+- **总计**: ~229M
			
 
				+
			
 
				+## 预防措施
			
 
				+
			
 
				+### 1. 环境变量配置
			
 
				+
			
 
				+已在 `~/.zshrc` 中设置：
			
 
				+
			
 
				+```bash
			
 
				+export CACHE_ROOT=~/cache
			
 
				+```
			
 
				+
			
 
				+这确保了所有新的缓存都会统一存储在 `~/cache/` 目录下。
			
 
				+
			
 
				+### 2. 代码改进
			
 
				+
			
 
				+- `lib/config.py` 提供统一的缓存路径管理
			
 
				+- 所有模块使用 `get_cache_dir()` 获取缓存路径
			
 
				+- 优先级：代码设置（`set_cache_root()`）> 环境变量 `CACHE_ROOT` > 默认值 `~/cache`
			
 
				+
			
 
				+### 3. 运行脚本的最佳实践
			
 
				+
			
 
				+**推荐做法**：
			
 
				+```bash
			
 
				+# 从项目根目录运行
			
 
				+cd /Users/semsevens/Desktop/workspace/daily/1113/how_1121_v2
			
 
				+python script/data_processing/your_script.py
			
 
				+```
			
 
				+
			
 
				+**避免做法**：
			
 
				+```bash
			
 
				+# 不要从 script/data_processing/ 目录运行
			
 
				+cd script/data_processing
			
 
				+python your_script.py  # 可能在当前目录创建缓存
			
 
				+```
			
 
				+
			
 
				+## 验证步骤
			
 
				+
			
 
				+如果想验证缓存配置是否正确：
			
 
				+
			
 
				+```python
			
 
				+from lib.config import get_cache_root, get_cache_dir
			
 
				+
			
 
				+# 检查缓存根目录
			
 
				+print(f"缓存根目录: {get_cache_root()}")
			
 
				+# 应输出: /Users/semsevens/cache
			
 
				+
			
 
				+# 检查具体模块的缓存目录
			
 
				+print(f"text_embedding 缓存: {get_cache_dir('text_embedding')}")
			
 
				+# 应输出: /Users/semsevens/cache/text_embedding
			
 
				+
			
 
				+print(f"semantic_similarity 缓存: {get_cache_dir('semantic_similarity')}")
			
 
				+# 应输出: /Users/semsevens/cache/semantic_similarity
			
 
				+```
			
 
				+
			
 
				+## 清理建议
			
 
				+
			
 
				+定期检查缓存大小：
			
 
				+
			
 
				+```bash
			
 
				+# 查看缓存大小
			
 
				+du -sh ~/cache/*/
			
 
				+
			
 
				+# 如果需要清理旧缓存
			
 
				+rm -rf ~/cache/text_embedding/*
			
 
				+rm -rf ~/cache/semantic_similarity/*
			
 
				+```
			
 
				+
			
 
				+## 总结
			
 
				+
			
 
				+✅ **已完成：**
			
 
				+- 发现并迁移了 105M 的孤立缓存
			
 
				+- 统一所有缓存到 `~/cache/`
			
 
				+- 验证项目中无其他孤立缓存
			
 
				+- 确认环境变量配置正确
			
 
				+
			
 
				+✅ **已预防：**
			
 
				+- 通过环境变量避免相对路径问题
			
 
				+- 通过 `lib/config.py` 统一管理缓存路径
			
 
				+- 文档说明最佳实践
			
 
				+
			
 
				+🎯 **结果：**
			
 
				+所有缓存现在都统一存储在 `~/cache/` 目录下，不会再出现孤立缓存的问题。
			
--- a/lib/config.py
+++ b/lib/config.py
@@ -0,0 +1,189 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+配置模块 - 统一管理项目配置
			
 
				+"""
			
 
				+import os
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+
			
 
				+
			
 
				+class Config:
			
 
				+    """项目配置类"""
			
 
				+
			
 
				+    # 默认缓存根目录（用户主目录下的 cache）
			
 
				+    _DEFAULT_CACHE_ROOT = os.path.expanduser("~/cache")
			
 
				+
			
 
				+    # 缓存根目录
			
 
				+    _cache_root: Optional[str] = None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_cache_root(cls) -> str:
			
 
				+        """
			
 
				+        获取缓存根目录
			
 
				+
			
 
				+        Returns:
			
 
				+            缓存根目录路径
			
 
				+        """
			
 
				+        if cls._cache_root is None:
			
 
				+            # 1. 优先从环境变量读取
			
 
				+            cache_root = os.environ.get("CACHE_ROOT")
			
 
				+            if cache_root:
			
 
				+                cls._cache_root = cache_root
			
 
				+            else:
			
 
				+                # 2. 使用默认路径
			
 
				+                cls._cache_root = cls._DEFAULT_CACHE_ROOT
			
 
				+
			
 
				+        return cls._cache_root
			
 
				+
			
 
				+    @classmethod
			
 
				+    def set_cache_root(cls, path: str) -> None:
			
 
				+        """
			
 
				+        设置缓存根目录
			
 
				+
			
 
				+        Args:
			
 
				+            path: 缓存根目录路径（可以是绝对路径或相对路径）
			
 
				+        """
			
 
				+        cls._cache_root = path
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_cache_dir(cls, subdir: str) -> str:
			
 
				+        """
			
 
				+        获取特定子模块的缓存目录
			
 
				+
			
 
				+        Args:
			
 
				+            subdir: 子目录名称，如：
			
 
				+                - "text_embedding", "semantic_similarity" - 计算缓存
			
 
				+                - "data/search", "data/detail" - 爬虫数据缓存
			
 
				+                - "data/analysis" - 分析结果缓存
			
 
				+
			
 
				+        Returns:
			
 
				+            完整的缓存目录路径
			
 
				+        """
			
 
				+        cache_root = cls.get_cache_root()
			
 
				+        return str(Path(cache_root) / subdir)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_data_dir(cls, subdir: str = "") -> str:
			
 
				+        """
			
 
				+        获取数据缓存目录（data 目录现在在缓存根目录下）
			
 
				+
			
 
				+        Args:
			
 
				+            subdir: 子目录名称，如 "search", "detail", "tools_list" 等
			
 
				+                   如果为空字符串，返回 data 根目录
			
 
				+
			
 
				+        Returns:
			
 
				+            完整的数据目录路径
			
 
				+
			
 
				+        Note:
			
 
				+            data 目录现在统一放在缓存根目录下：
			
 
				+            - 默认：cache/data/
			
 
				+            - 如果设置了 CACHE_ROOT=/custom: /custom/data/
			
 
				+        """
			
 
				+        cache_root = cls.get_cache_root()
			
 
				+        if subdir:
			
 
				+            return str(Path(cache_root) / "data" / subdir)
			
 
				+        return str(Path(cache_root) / "data")
			
 
				+
			
 
				+    @classmethod
			
 
				+    def reset(cls) -> None:
			
 
				+        """
			
 
				+        重置配置为默认值（主要用于测试）
			
 
				+        """
			
 
				+        cls._cache_root = None
			
 
				+
			
 
				+
			
 
				+# 便捷函数
			
 
				+def get_cache_root() -> str:
			
 
				+    """获取缓存根目录"""
			
 
				+    return Config.get_cache_root()
			
 
				+
			
 
				+
			
 
				+def set_cache_root(path: str) -> None:
			
 
				+    """设置缓存根目录"""
			
 
				+    Config.set_cache_root(path)
			
 
				+
			
 
				+
			
 
				+def get_cache_dir(subdir: str) -> str:
			
 
				+    """获取特定子模块的缓存目录"""
			
 
				+    return Config.get_cache_dir(subdir)
			
 
				+
			
 
				+
			
 
				+def get_data_dir(subdir: str = "") -> str:
			
 
				+    """
			
 
				+    获取数据缓存目录
			
 
				+
			
 
				+    Note: data 目录现在在缓存根目录下，例如 cache/data/
			
 
				+    """
			
 
				+    return Config.get_data_dir(subdir)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    print("=" * 60)
			
 
				+    print("配置模块示例")
			
 
				+    print("=" * 60)
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 1: 使用默认配置
			
 
				+    print("示例 1: 默认配置")
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"text_embedding 缓存: {get_cache_dir('text_embedding')}")
			
 
				+    print(f"semantic_similarity 缓存: {get_cache_dir('semantic_similarity')}")
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 2: 自定义缓存根目录
			
 
				+    print("示例 2: 自定义缓存根目录")
			
 
				+    set_cache_root("/tmp/my_cache")
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"text_embedding 缓存: {get_cache_dir('text_embedding')}")
			
 
				+    print(f"semantic_similarity 缓存: {get_cache_dir('semantic_similarity')}")
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 3: 使用相对路径
			
 
				+    print("示例 3: 使用相对路径")
			
 
				+    set_cache_root("data/cache")
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"text_embedding 缓存: {get_cache_dir('text_embedding')}")
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 4: 通过环境变量设置
			
 
				+    print("示例 4: 通过环境变量设置")
			
 
				+    Config.reset()  # 重置配置
			
 
				+    os.environ["CACHE_ROOT"] = "/Users/semsevens/Desktop/workspace/daily/1113/how_1120_v3/cache"
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"text_embedding 缓存: {get_cache_dir('text_embedding')}")
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 5: 数据目录配置（在缓存根目录下）
			
 
				+    print("示例 5: 数据目录配置（在缓存根目录下）")
			
 
				+    Config.reset()  # 重置配置
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"data 目录: {get_data_dir()}")
			
 
				+    print(f"search 数据: {get_data_dir('search')}")
			
 
				+    print(f"detail 数据: {get_data_dir('detail')}")
			
 
				+    print()
			
 
				+
			
 
				+    # 示例 6: 设置缓存根目录后，data 也会跟着变
			
 
				+    print("示例 6: 设置缓存根目录后，data 也会跟着变")
			
 
				+    set_cache_root("/custom/cache")
			
 
				+    print(f"缓存根目录: {get_cache_root()}")
			
 
				+    print(f"data 目录: {get_data_dir()}")
			
 
				+    print(f"search 数据: {get_data_dir('search')}")
			
 
				+    print()
			
 
				+
			
 
				+    print("=" * 60)
			
 
				+    print("使用方法:")
			
 
				+    print("-" * 60)
			
 
				+    print("缓存根目录:")
			
 
				+    print("  1. 默认使用 'cache' 目录")
			
 
				+    print("  2. 通过代码设置: set_cache_root('/path/to/cache')")
			
 
				+    print("  3. 通过环境变量: export CACHE_ROOT=/path/to/cache")
			
 
				+    print()
			
 
				+    print("目录结构:")
			
 
				+    print("  cache/")
			
 
				+    print("    ├── text_embedding/          # 向量相似度缓存")
			
 
				+    print("    ├── semantic_similarity/     # 语义相似度缓存")
			
 
				+    print("    └── data/                    # 数据缓存（原 data 目录）")
			
 
				+    print("        ├── search/              # 搜索数据")
			
 
				+    print("        ├── detail/              # 详情数据")
			
 
				+    print("        └── analysis/            # 分析结果")
			
 
				+    print("=" * 60)
			
--- a/lib/hybrid_similarity.py
+++ b/lib/hybrid_similarity.py
@@ -8,6 +8,7 @@ from typing import Dict, Any, Optional
 
				 import asyncio
			
 
				 from lib.text_embedding import compare_phrases as compare_phrases_embedding
			
 
				 from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
			
 
				+from lib.config import get_cache_dir
			
 
				 
			
 
				 
			
 
				 async def compare_phrases(
			
@@ -18,8 +19,8 @@ async def compare_phrases(
 
				     embedding_model: str = "chinese",
			
 
				     semantic_model: str = 'openai/gpt-4.1-mini',
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir_embedding: str = "cache/text_embedding",
			
 
				-    cache_dir_semantic: str = "cache/semantic_similarity",
			
 
				+    cache_dir_embedding: Optional[str] = None,
			
 
				+    cache_dir_semantic: Optional[str] = None,
			
 
				     **semantic_kwargs
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
@@ -33,8 +34,8 @@ async def compare_phrases(
 
				         embedding_model: 向量模型名称，默认 "chinese"
			
 
				         semantic_model: LLM模型名称，默认 'openai/gpt-4.1-mini'
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir_embedding: 向量模型缓存目录
			
 
				-        cache_dir_semantic: LLM模型缓存目录
			
 
				+        cache_dir_embedding: 向量模型缓存目录，默认从配置读取
			
 
				+        cache_dir_semantic: LLM模型缓存目录，默认从配置读取
			
 
				         **semantic_kwargs: 其他传递给semantic_similarity的参数
			
 
				             - temperature: 温度参数，默认 0.0
			
 
				             - max_tokens: 最大token数，默认 65536
			
@@ -74,6 +75,12 @@ async def compare_phrases(
 
				     if abs(total_weight - 1.0) > 0.001:
			
 
				         raise ValueError(f"权重之和必须为1.0，当前为: {total_weight}")
			
 
				 
			
 
				+    # 使用配置的缓存目录（如果未指定）
			
 
				+    if cache_dir_embedding is None:
			
 
				+        cache_dir_embedding = get_cache_dir("text_embedding")
			
 
				+    if cache_dir_semantic is None:
			
 
				+        cache_dir_semantic = get_cache_dir("semantic_similarity")
			
 
				+
			
 
				     # 并发调用两个模型
			
 
				     embedding_task = asyncio.to_thread(
			
 
				         compare_phrases_embedding,
			
--- a/lib/semantic_similarity.py
+++ b/lib/semantic_similarity.py
@@ -7,6 +7,7 @@
 
				 from agents import Agent, Runner, ModelSettings
			
 
				 from lib.client import get_model
			
 
				 from lib.utils import parse_json_from_text
			
 
				+from lib.config import get_cache_dir
			
 
				 from typing import Dict, Any, Optional
			
 
				 import hashlib
			
 
				 import json
			
@@ -26,8 +27,10 @@ DEFAULT_PROMPT_TEMPLATE = """
 
				 ```
			
 
				 """.strip()
			
 
				 
			
 
				-# 默认缓存目录
			
 
				-DEFAULT_CACHE_DIR = "cache/semantic_similarity"
			
 
				+
			
 
				+def _get_default_cache_dir() -> str:
			
 
				+    """获取默认缓存目录（从配置中读取）"""
			
 
				+    return get_cache_dir("semantic_similarity")
			
 
				 
			
 
				 
			
 
				 def _generate_cache_key(
			
@@ -91,7 +94,7 @@ def _get_cache_filepath(
 
				     phrase_b: str,
			
 
				     model_name: str,
			
 
				     temperature: float,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Path:
			
 
				     """
			
 
				     获取缓存文件路径（可读文件名）
			
@@ -110,6 +113,9 @@ def _get_cache_filepath(
 
				     文件名格式: {phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json
			
 
				     示例: 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     # 清理短语和模型名
			
 
				     clean_a = _sanitize_for_filename(phrase_a, max_length=20)
			
 
				     clean_b = _sanitize_for_filename(phrase_b, max_length=20)
			
@@ -136,7 +142,7 @@ def _load_from_cache(
 
				     phrase_b: str,
			
 
				     model_name: str,
			
 
				     temperature: float,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Optional[str]:
			
 
				     """
			
 
				     从缓存加载数据
			
@@ -152,6 +158,9 @@ def _load_from_cache(
 
				     Returns:
			
 
				         缓存的结果字符串，如果不存在则返回 None
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				 
			
 
				     # 如果文件不存在，尝试通过哈希匹配查找
			
@@ -187,7 +196,7 @@ def _save_to_cache(
 
				     instructions: str,
			
 
				     tools: str,
			
 
				     result: str,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> None:
			
 
				     """
			
 
				     保存数据到缓存
			
@@ -205,6 +214,9 @@ def _save_to_cache(
 
				         result: 结果数据（原始字符串）
			
 
				         cache_dir: 缓存目录
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				 
			
 
				     # 确保缓存目录存在
			
@@ -254,7 +266,7 @@ async def _difference_between_phrases(
 
				     tools: list = None,
			
 
				     name: str = "Semantic Similarity Analyzer",
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> str:
			
 
				     """
			
 
				     从语义角度判断两个短语的相似度
			
@@ -277,7 +289,7 @@ async def _difference_between_phrases(
 
				         tools: Agent 可用的工具列表，默认为 []
			
 
				         name: Agent 的名称，默认为 "Semantic Similarity Analyzer"（不参与缓存key构建）
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+        cache_dir: 缓存目录，默认从配置读取（可通过 lib.config 设置）
			
 
				 
			
 
				     Returns:
			
 
				         JSON 格式的相似度分析结果字符串
			
@@ -345,13 +357,8 @@ async def _difference_between_phrases(
 
				     result = await Runner.run(agent, input=prompt)
			
 
				     final_output = result.final_output
			
 
				 
			
 
				-    # 保存到缓存
			
 
				-    if use_cache:
			
 
				-        _save_to_cache(
			
 
				-            cache_key, phrase_a, phrase_b, model_name,
			
 
				-            temperature, max_tokens, prompt_template,
			
 
				-            instructions, tools_str, final_output, cache_dir
			
 
				-        )
			
 
				+    # 注意：不在这里缓存，而是在解析成功后缓存
			
 
				+    # 这样可以避免缓存解析失败的响应
			
 
				 
			
 
				     return final_output
			
 
				 
			
@@ -367,7 +374,7 @@ async def _difference_between_phrases_parsed(
 
				     tools: list = None,
			
 
				     name: str = "Semantic Similarity Analyzer",
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
 
				     从语义角度判断两个短语的相似度，并解析返回结果为字典
			
@@ -383,13 +390,16 @@ async def _difference_between_phrases_parsed(
 
				         tools: Agent 可用的工具列表，默认为 []
			
 
				         name: Agent 的名称，默认为 "Semantic Similarity Analyzer"
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+        cache_dir: 缓存目录，默认从配置读取（可通过 lib.config 设置）
			
 
				 
			
 
				     Returns:
			
 
				         解析后的字典，包含：
			
 
				         - 说明: 相似度判断的理由
			
 
				         - 相似度: 0-1之间的浮点数
			
 
				 
			
 
				+    Raises:
			
 
				+        ValueError: 当无法解析AI响应为有效JSON时抛出
			
 
				+
			
 
				     Examples:
			
 
				         >>> result = await difference_between_phrases_parsed("宿命感", "余华的小说")
			
 
				         >>> print(result['相似度'])
			
@@ -397,21 +407,68 @@ async def _difference_between_phrases_parsed(
 
				         >>> print(result['说明'])
			
 
				         "两个概念有一定关联..."
			
 
				     """
			
 
				+    # 使用默认模板或自定义模板
			
 
				+    if prompt_template is None:
			
 
				+        prompt_template = DEFAULT_PROMPT_TEMPLATE
			
 
				+
			
 
				+    # 默认tools为空列表
			
 
				+    if tools is None:
			
 
				+        tools = []
			
 
				+
			
 
				+    # 生成缓存键
			
 
				+    tools_str = json.dumps(tools, sort_keys=True) if tools else "[]"
			
 
				+    cache_key = _generate_cache_key(
			
 
				+        phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
			
 
				+    )
			
 
				+
			
 
				+    # 尝试从缓存加载
			
 
				+    if use_cache:
			
 
				+        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				+        if cached_result is not None:
			
 
				+            # 缓存命中，直接解析并返回
			
 
				+            parsed_result = parse_json_from_text(cached_result)
			
 
				+            if parsed_result:
			
 
				+                return parsed_result
			
 
				+            # 如果缓存的内容也无法解析，继续执行API调用（可能之前缓存了错误响应）
			
 
				+
			
 
				+    # 调用AI获取原始响应（不传use_cache，因为我们在这里手动处理缓存）
			
 
				     raw_result = await _difference_between_phrases(
			
 
				         phrase_a, phrase_b, model_name, temperature, max_tokens,
			
 
				-        prompt_template, instructions, tools, name, use_cache, cache_dir
			
 
				+        prompt_template, instructions, tools, name, use_cache=False, cache_dir=cache_dir
			
 
				     )
			
 
				 
			
 
				     # 使用 utils.parse_json_from_text 解析结果
			
 
				     parsed_result = parse_json_from_text(raw_result)
			
 
				 
			
 
				-    # 如果解析失败（返回空字典），返回带错误信息的结果
			
 
				+    # 如果解析失败（返回空字典），抛出异常并包含详细信息
			
 
				     if not parsed_result:
			
 
				-        return {
			
 
				-            "说明": "解析失败: 无法从响应中提取有效的 JSON",
			
 
				-            "相似度": 0.0,
			
 
				-            "raw_response": raw_result
			
 
				-        }
			
 
				+        # 格式化prompt用于错误信息
			
 
				+        formatted_prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)
			
 
				+
			
 
				+        error_msg = f"""
			
 
				+JSON解析失败！
			
 
				+================================================================================
			
 
				+短语A: {phrase_a}
			
 
				+短语B: {phrase_b}
			
 
				+模型: {model_name}
			
 
				+温度: {temperature}
			
 
				+================================================================================
			
 
				+Prompt:
			
 
				+{formatted_prompt}
			
 
				+================================================================================
			
 
				+AI响应 (长度: {len(raw_result)}):
			
 
				+{raw_result}
			
 
				+================================================================================
			
 
				+"""
			
 
				+        raise ValueError(error_msg)
			
 
				+
			
 
				+    # 只有解析成功后才缓存
			
 
				+    if use_cache:
			
 
				+        _save_to_cache(
			
 
				+            cache_key, phrase_a, phrase_b, model_name,
			
 
				+            temperature, max_tokens, prompt_template,
			
 
				+            instructions, tools_str, raw_result, cache_dir
			
 
				+        )
			
 
				 
			
 
				     return parsed_result
			
 
				 
			
@@ -430,7 +487,7 @@ async def compare_phrases(
 
				     tools: list = None,
			
 
				     name: str = "Semantic Similarity Analyzer",
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
 
				     比较两个短语的语义相似度（对外唯一接口）
			
@@ -446,7 +503,7 @@ async def compare_phrases(
 
				         tools: Agent 可用的工具列表，默认为 []
			
 
				         name: Agent 的名称，默认为 "Semantic Similarity Analyzer"
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+        cache_dir: 缓存目录，默认从配置读取（可通过 lib.config 设置）
			
 
				 
			
 
				     Returns:
			
 
				         解析后的字典
			
@@ -540,7 +597,7 @@ async def compare_phrases_v2(
 
				     tools: list = None,
			
 
				     name: str = "Advanced Semantic Analyzer",
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
 
				     比较两个短语的语义相似度 - V2 版本（详细分析）
			
@@ -561,7 +618,7 @@ async def compare_phrases_v2(
 
				         tools: Agent 可用的工具列表，默认为 []
			
 
				         name: Agent 的名称，默认 "Advanced Semantic Analyzer"
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+        cache_dir: 缓存目录，默认从配置读取（可通过 lib.config 设置）
			
 
				 
			
 
				     Returns:
			
 
				         解析后的字典，包含：
			
--- a/lib/text_embedding.py
+++ b/lib/text_embedding.py
@@ -11,6 +11,8 @@ from pathlib import Path
 
				 from datetime import datetime
			
 
				 import threading
			
 
				 
			
 
				+from .config import get_cache_dir
			
 
				+
			
 
				 # 支持的模型列表
			
 
				 SUPPORTED_MODELS = {
			
 
				     "chinese": "shibing624/text2vec-base-chinese",           # 默认，中文通用
			
@@ -23,8 +25,10 @@ SUPPORTED_MODELS = {
 
				 _similarity_models = {}  # 存储多个模型实例
			
 
				 _model_lock = threading.Lock()  # 线程锁，保护模型加载
			
 
				 
			
 
				-# 默认缓存目录
			
 
				-DEFAULT_CACHE_DIR = "cache/text_embedding"
			
 
				+
			
 
				+def _get_default_cache_dir() -> str:
			
 
				+    """获取默认缓存目录（从配置中读取）"""
			
 
				+    return get_cache_dir("text_embedding")
			
 
				 
			
 
				 
			
 
				 def _generate_cache_key(phrase_a: str, phrase_b: str, model_name: str) -> str:
			
@@ -70,7 +74,7 @@ def _get_cache_filepath(
 
				     phrase_a: str,
			
 
				     phrase_b: str,
			
 
				     model_name: str,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Path:
			
 
				     """
			
 
				     获取缓存文件路径（可读文件名）
			
@@ -87,6 +91,9 @@ def _get_cache_filepath(
 
				 
			
 
				     文件名格式: {phrase_a}_vs_{phrase_b}_{model}_{hash[:8]}.json
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     # 清理短语和模型名
			
 
				     clean_a = _sanitize_for_filename(phrase_a, max_length=20)
			
 
				     clean_b = _sanitize_for_filename(phrase_b, max_length=20)
			
@@ -109,7 +116,7 @@ def _load_from_cache(
 
				     phrase_a: str,
			
 
				     phrase_b: str,
			
 
				     model_name: str,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Optional[Dict[str, Any]]:
			
 
				     """
			
 
				     从缓存加载数据
			
@@ -124,6 +131,9 @@ def _load_from_cache(
 
				     Returns:
			
 
				         缓存的结果字典，如果不存在则返回 None
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, cache_dir)
			
 
				 
			
 
				     # 如果文件不存在，尝试通过哈希匹配查找
			
@@ -153,7 +163,7 @@ def _save_to_cache(
 
				     phrase_b: str,
			
 
				     model_name: str,
			
 
				     result: Dict[str, Any],
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> None:
			
 
				     """
			
 
				     保存数据到缓存
			
@@ -166,6 +176,9 @@ def _save_to_cache(
 
				         result: 结果数据（字典格式）
			
 
				         cache_dir: 缓存目录
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, cache_dir)
			
 
				 
			
 
				     # 确保缓存目录存在
			
@@ -237,7 +250,7 @@ def compare_phrases(
 
				     phrase_b: str,
			
 
				     model_name: str = "chinese",
			
 
				     use_cache: bool = True,
			
 
				-    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+    cache_dir: Optional[str] = None
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
 
				     比较两个短语的语义相似度（兼容 semantic_similarity.py 的接口）
			
@@ -264,7 +277,7 @@ def compare_phrases(
 
				             - "shibing624/text2vec-base-chinese-paraphrase"
			
 
				             - "shibing624/text2vec-base-chinese-sentence"
			
 
				         use_cache: 是否使用缓存，默认 True
			
 
				-        cache_dir: 缓存目录，默认 'cache/text_embedding'
			
 
				+        cache_dir: 缓存目录，默认从配置读取（可通过 lib.config 设置）
			
 
				 
			
 
				     Returns:
			
 
				         {
			
@@ -285,7 +298,13 @@ def compare_phrases(
 
				 
			
 
				         >>> # 禁用缓存
			
 
				         >>> result = compare_phrases("测试", "测试", use_cache=False)
			
 
				+
			
 
				+        >>> # 自定义缓存目录
			
 
				+        >>> result = compare_phrases("测试1", "测试2", cache_dir="/tmp/my_cache")
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = _get_default_cache_dir()
			
 
				+
			
 
				     # 转换简称为完整名称（用于缓存键）
			
 
				     full_model_name = SUPPORTED_MODELS.get(model_name, model_name)
			
 
				 
			
--- a/migrate_data_to_cache.py
+++ b/migrate_data_to_cache.py
@@ -0,0 +1,151 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+数据目录迁移脚本
			
 
				+
			
 
				+将 data/ 目录迁移到 cache/data/ 目录下，实现统一的缓存管理。
			
 
				+
			
 
				+注意：
			
 
				+- 会保留 data/ 目录中的非缓存文件（如文档、配置等）
			
 
				+- 只移动缓存性质的数据（爬虫数据、分析结果等）
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import shutil
			
 
				+from pathlib import Path
			
 
				+
			
 
				+
			
 
				+def migrate_data_to_cache():
			
 
				+    """将 data 目录迁移到 cache/data"""
			
 
				+
			
 
				+    project_root = Path(__file__).parent
			
 
				+    old_data_dir = project_root / "data"
			
 
				+    new_data_dir = project_root / "cache" / "data"
			
 
				+
			
 
				+    print("=" * 60)
			
 
				+    print("数据目录迁移脚本")
			
 
				+    print("=" * 60)
			
 
				+    print(f"源目录: {old_data_dir}")
			
 
				+    print(f"目标目录: {new_data_dir}")
			
 
				+    print()
			
 
				+
			
 
				+    # 检查源目录是否存在
			
 
				+    if not old_data_dir.exists():
			
 
				+        print("✓ data/ 目录不存在，无需迁移")
			
 
				+        return
			
 
				+
			
 
				+    # 确保目标目录的父目录存在
			
 
				+    new_data_dir.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # 如果目标目录已存在，询问是否覆盖
			
 
				+    if new_data_dir.exists():
			
 
				+        print(f"⚠️  目标目录已存在: {new_data_dir}")
			
 
				+        response = input("是否继续？这将合并两个目录的内容 (y/n): ")
			
 
				+        if response.lower() != 'y':
			
 
				+            print("取消迁移")
			
 
				+            return
			
 
				+
			
 
				+    print("开始迁移...")
			
 
				+    print()
			
 
				+
			
 
				+    # 统计信息
			
 
				+    total_files = 0
			
 
				+    total_dirs = 0
			
 
				+    skipped_items = []
			
 
				+
			
 
				+    # 需要跳过的目录和文件（非缓存数据）
			
 
				+    skip_patterns = {
			
 
				+        '.git',
			
 
				+        '.DS_Store',
			
 
				+        '__pycache__',
			
 
				+        '*.md',  # 文档文件
			
 
				+        '*.txt', # 说明文件
			
 
				+        'README*',
			
 
				+    }
			
 
				+
			
 
				+    # 遍历源目录
			
 
				+    for item in old_data_dir.iterdir():
			
 
				+        item_name = item.name
			
 
				+
			
 
				+        # 检查是否应该跳过
			
 
				+        should_skip = False
			
 
				+        for pattern in skip_patterns:
			
 
				+            if pattern.startswith('*'):
			
 
				+                if item_name.endswith(pattern[1:]):
			
 
				+                    should_skip = True
			
 
				+                    break
			
 
				+            elif pattern.endswith('*'):
			
 
				+                if item_name.startswith(pattern[:-1]):
			
 
				+                    should_skip = True
			
 
				+                    break
			
 
				+            elif item_name == pattern:
			
 
				+                should_skip = True
			
 
				+                break
			
 
				+
			
 
				+        if should_skip:
			
 
				+            skipped_items.append(item_name)
			
 
				+            print(f"⊘ 跳过: {item_name} (非缓存数据)")
			
 
				+            continue
			
 
				+
			
 
				+        # 目标路径
			
 
				+        target = new_data_dir / item_name
			
 
				+
			
 
				+        try:
			
 
				+            if item.is_dir():
			
 
				+                # 如果目标已存在，合并；否则直接移动
			
 
				+                if target.exists():
			
 
				+                    print(f"→ 合并目录: {item_name}/")
			
 
				+                    # 递归复制并删除源
			
 
				+                    shutil.copytree(item, target, dirs_exist_ok=True)
			
 
				+                    shutil.rmtree(item)
			
 
				+                else:
			
 
				+                    print(f"→ 移动目录: {item_name}/")
			
 
				+                    shutil.move(str(item), str(target))
			
 
				+                total_dirs += 1
			
 
				+            else:
			
 
				+                # 文件直接移动
			
 
				+                if target.exists():
			
 
				+                    print(f"→ 覆盖文件: {item_name}")
			
 
				+                else:
			
 
				+                    print(f"→ 移动文件: {item_name}")
			
 
				+                shutil.move(str(item), str(target))
			
 
				+                total_files += 1
			
 
				+        except Exception as e:
			
 
				+            print(f"✗ 移动失败: {item_name}")
			
 
				+            print(f"  错误: {e}")
			
 
				+
			
 
				+    print()
			
 
				+    print("=" * 60)
			
 
				+    print("迁移完成！")
			
 
				+    print("=" * 60)
			
 
				+    print(f"移动的目录数: {total_dirs}")
			
 
				+    print(f"移动的文件数: {total_files}")
			
 
				+    if skipped_items:
			
 
				+        print(f"跳过的项目数: {len(skipped_items)}")
			
 
				+        print(f"跳过的项目: {', '.join(skipped_items)}")
			
 
				+    print()
			
 
				+
			
 
				+    # 检查 data 目录是否为空
			
 
				+    remaining_items = list(old_data_dir.iterdir())
			
 
				+    if not remaining_items:
			
 
				+        print(f"✓ 源目录 {old_data_dir} 已空，可以删除")
			
 
				+        response = input("是否删除空的 data/ 目录？(y/n): ")
			
 
				+        if response.lower() == 'y':
			
 
				+            old_data_dir.rmdir()
			
 
				+            print(f"✓ 已删除 {old_data_dir}")
			
 
				+    else:
			
 
				+        print(f"⚠️  源目录 {old_data_dir} 中还有以下内容：")
			
 
				+        for item in remaining_items:
			
 
				+            print(f"  - {item.name}")
			
 
				+        print("这些可能是非缓存数据，已保留在原位置")
			
 
				+
			
 
				+    print()
			
 
				+    print("=" * 60)
			
 
				+    print("下一步操作：")
			
 
				+    print("1. 验证迁移结果：检查 cache/data/ 目录中的数据是否完整")
			
 
				+    print("2. 如果一切正常，可以删除旧的 data/ 目录（如果为空）")
			
 
				+    print("3. 更新你的脚本或配置，使用新的路径")
			
 
				+    print("=" * 60)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    migrate_data_to_cache()
			
--- a/script/analysis/analyze_model_comparison.py
+++ b/script/analysis/analyze_model_comparison.py
@@ -18,6 +18,8 @@ import pandas as pd
 
				 from datetime import datetime
			
 
				 
			
 
				 # 添加项目根目录到路径
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_cache_dir
			
 
				 project_root = Path(__file__).parent.parent.parent
			
 
				 sys.path.insert(0, str(project_root))
			
 
				 
			
@@ -237,9 +239,9 @@ def export_to_excel(
 
				 
			
 
				 def main():
			
 
				     """主函数"""
			
 
				-    # 配置参数
			
 
				-    text_embedding_cache = "cache/text_embedding"
			
 
				-    semantic_similarity_cache = "cache/semantic_similarity"
			
 
				+    # 配置参数（从配置模块获取）
			
 
				+    text_embedding_cache = get_cache_dir("text_embedding")
			
 
				+    semantic_similarity_cache = get_cache_dir("semantic_similarity")
			
 
				     output_file = f"data/model_comparison_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
			
 
				 
			
 
				     print("=" * 60)
			
--- a/script/analysis/test_all_models.py
+++ b/script/analysis/test_all_models.py
@@ -19,6 +19,8 @@ from datetime import datetime
 
				 project_root = Path(__file__).parent.parent.parent
			
 
				 sys.path.insert(0, str(project_root))
			
 
				 
			
 
				+from lib.config import get_cache_dir
			
 
				+
			
 
				 from lib.text_embedding import compare_phrases, SUPPORTED_MODELS
			
 
				 
			
 
				 # 全局并发限制
			
@@ -88,7 +90,7 @@ def get_semaphore():
 
				 
			
 
				 
			
 
				 def extract_test_cases_from_cache(
			
 
				-    cache_dir: str = "cache/text_embedding"
			
 
				+    cache_dir: str = None
			
 
				 ) -> List[Tuple[str, str]]:
			
 
				     """
			
 
				     从现有缓存文件中提取所有测试用例
			
@@ -99,6 +101,9 @@ def extract_test_cases_from_cache(
 
				     Returns:
			
 
				         测试用例列表，每项为 (phrase_a, phrase_b) 元组
			
 
				     """
			
 
				+    if cache_dir is None:
			
 
				+        cache_dir = get_cache_dir("text_embedding")
			
 
				+
			
 
				     cache_path = Path(cache_dir)
			
 
				 
			
 
				     if not cache_path.exists():
			
@@ -304,8 +309,8 @@ def save_results(
 
				 
			
 
				 async def main():
			
 
				     """主函数"""
			
 
				-    # 配置参数
			
 
				-    cache_dir = "cache/text_embedding"
			
 
				+    # 配置参数（从配置模块获取）
			
 
				+    cache_dir = get_cache_dir("text_embedding")
			
 
				     output_file = "data/model_comparison_results.json"
			
 
				 
			
 
				     # 步骤 1: 从缓存提取测试用例
			
--- a/script/detail/xiaohongshu_detail.py
+++ b/script/detail/xiaohongshu_detail.py
@@ -13,6 +13,13 @@ import hashlib
 
				 import re
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any, Optional
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 from pathlib import Path
			
 
				 
			
 
				 
			
@@ -38,10 +45,9 @@ class XiaohongshuDetail:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/detail 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "detail")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("detail")
			
 
				 
			
 
				     def _sanitize_note_id(self, note_id: str) -> str:
			
 
				         """
			
@@ -403,8 +409,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/detail',
			
 
				-        help='结果输出目录 (默认: data/detail)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--note-id',
			
--- a/script/get_tools_list.py
+++ b/script/get_tools_list.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class ToolsListFetcher:
			
@@ -31,10 +38,8 @@ class ToolsListFetcher:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/tools_list 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(script_dir)
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "tools_list")
			
 
				+            # 默认从配置读取
			
 
				+            self.results_base_dir = get_data_dir("tools_list")
			
 
				 
			
 
				     def get_tools_list(self, timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -94,8 +99,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/tools_list',
			
 
				-        help='结果输出目录 (默认: data/tools_list)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     args = parser.parse_args()
			
 
				 
			
--- a/script/search/ai_search.py
+++ b/script/search/ai_search.py
@@ -10,6 +10,12 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				 
			
 
				 
			
 
				 class AISearch:
			
@@ -31,10 +37,8 @@ class AISearch:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search")
			
 
				+            # 默认从配置读取
			
 
				+            self.results_base_dir = get_data_dir("search")
			
 
				 
			
 
				     def search(self, query: str, timeout: int = 60) -> Dict[str, Any]:
			
 
				         """
			
@@ -109,8 +113,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search',
			
 
				-        help='结果输出目录 (默认: data/search)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取，通常为 data/search)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--query',
			
--- a/script/search/custom_search.py
+++ b/script/search/custom_search.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class CustomSearch:
			
@@ -31,10 +38,9 @@ class CustomSearch:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search")
			
 
				 
			
 
				     def search(self, keyword: str, platform: str = "google", timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -105,8 +111,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search',
			
 
				-        help='结果输出目录 (默认: data/search)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search/douyin_search.py
+++ b/script/search/douyin_search.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class DouyinSearch:
			
@@ -32,10 +39,9 @@ class DouyinSearch:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search")
			
 
				 
			
 
				     def search(self, keyword: str, timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -103,8 +109,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search',
			
 
				-        help='结果输出目录 (默认: data/search)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search/xiaohongshu_search.py
+++ b/script/search/xiaohongshu_search.py
@@ -13,6 +13,13 @@ import hashlib
 
				 import re
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any, Optional, Tuple
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 from copy import deepcopy
			
 
				 from pathlib import Path
			
 
				 
			
@@ -39,10 +46,9 @@ class XiaohongshuSearch:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search")
			
 
				 
			
 
				     def _sanitize_keyword(self, keyword: str) -> str:
			
 
				         """
			
@@ -480,8 +486,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search',
			
 
				-        help='结果输出目录 (默认: data/search)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search_recommendations/bilibili_search_recommendations.py
+++ b/script/search_recommendations/bilibili_search_recommendations.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class BilibiliSearchRecommendations:
			
@@ -32,10 +39,9 @@ class BilibiliSearchRecommendations:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search_recommendations 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search_recommendations")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search_recommendations")
			
 
				 
			
 
				     def get_recommendations(self, keyword: str, timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -103,8 +109,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search_recommendations',
			
 
				-        help='结果输出目录 (默认: data/search_recommendations)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search_recommendations/douyin_search_recommendations.py
+++ b/script/search_recommendations/douyin_search_recommendations.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class DouyinSearchRecommendations:
			
@@ -32,10 +39,9 @@ class DouyinSearchRecommendations:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search_recommendations 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search_recommendations")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search_recommendations")
			
 
				 
			
 
				     def get_recommendations(self, keyword: str, timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -103,8 +109,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search_recommendations',
			
 
				-        help='结果输出目录 (默认: data/search_recommendations)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search_recommendations/xiaohongshu_search_recommendations.py
+++ b/script/search_recommendations/xiaohongshu_search_recommendations.py
@@ -13,6 +13,13 @@ import ast
 
				 import hashlib
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any, Optional
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class XiaohongshuSearchRecommendations:
			
@@ -37,10 +44,9 @@ class XiaohongshuSearchRecommendations:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search_recommendations 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search_recommendations")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search_recommendations")
			
 
				 
			
 
				         # 缓存设置
			
 
				         self.enable_cache = enable_cache
			
@@ -311,8 +317,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search_recommendations',
			
 
				-        help='结果输出目录 (默认: data/search_recommendations)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search_tagwords/douyin_search_tagword.py
+++ b/script/search_tagwords/douyin_search_tagword.py
@@ -10,6 +10,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class DouyinSearchTagWord:
			
@@ -32,10 +39,9 @@ class DouyinSearchTagWord:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search_tagwords 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search_tagwords")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search_tagwords")
			
 
				 
			
 
				     def get_tagwords(self, keyword: str, timeout: int = 30) -> Dict[str, Any]:
			
 
				         """
			
@@ -103,8 +109,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search_tagwords',
			
 
				-        help='结果输出目录 (默认: data/search_tagwords)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',
			
--- a/script/search_tagwords/xiaohongshu_search_hashtag.py
+++ b/script/search_tagwords/xiaohongshu_search_hashtag.py
@@ -11,6 +11,13 @@ import os
 
				 import argparse
			
 
				 from datetime import datetime
			
 
				 from typing import Dict, Any
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加项目根目录到路径并导入配置
			
 
				+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
			
 
				+from lib.config import get_data_dir
			
 
				+
			
 
				 
			
 
				 
			
 
				 class XiaohongshuSearchHashtag:
			
@@ -33,10 +40,9 @@ class XiaohongshuSearchHashtag:
 
				         if results_dir:
			
 
				             self.results_base_dir = results_dir
			
 
				         else:
			
 
				-            # 默认使用项目根目录的 data/search_tagwords 文件夹
			
 
				-            script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-            project_root = os.path.dirname(os.path.dirname(script_dir))
			
 
				-            self.results_base_dir = os.path.join(project_root, "data", "search_tagwords")
			
 
				+            # 默认从配置读取
			
 
				+
			
 
				+            self.results_base_dir = get_data_dir("search_tagwords")
			
 
				 
			
 
				     def get_hashtags(self, prompt: str, timeout: int = 60) -> Dict[str, Any]:
			
 
				         """
			
@@ -112,8 +118,8 @@ def main():
 
				     parser.add_argument(
			
 
				         '--results-dir',
			
 
				         type=str,
			
 
				-        default='data/search_tagwords',
			
 
				-        help='结果输出目录 (默认: data/search_tagwords)'
			
 
				+        default=None,
			
 
				+        help='结果输出目录 (默认: 从配置读取)'
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         '--keyword',