#!/usr/bin/env python3
"""
Xiaohongshu note search tool.
Searches Xiaohongshu notes by keyword, with support for several filter options.
"""
import requests
import json
import os
import argparse
import time
import hashlib
import re
import sys
from datetime import datetime
from typing import Dict, Any, Optional, Tuple
from pathlib import Path

# Add the project root to sys.path, then import the project config
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from lib.config import get_data_dir


class XiaohongshuSearch:
    """Wrapper around the Xiaohongshu note-search API."""

    BASE_URL = "http://47.84.182.56:8001"
    TOOL_NAME = "xhs_note_search"
    PLATFORM = "xiaohongshu"

    def __init__(self, results_dir: Optional[str] = None, use_cache: bool = True):
        """
        Initialize the API client.
        Args:
            results_dir: Output directory for results; defaults to the data/search
                folder under the project root.
            use_cache: Whether to enable caching. Defaults to True.
        """
        self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
        self.use_cache = use_cache
        # Set the results output directory
        if results_dir:
            self.results_base_dir = results_dir
        else:
            # Fall back to the configured default
            self.results_base_dir = get_data_dir("search")

    def _sanitize_keyword(self, keyword: str) -> str:
        """
        Sanitize a keyword so it can be used as a folder name.
        Args:
            keyword: The raw keyword.
        Returns:
            The sanitized keyword.
        """
        # Replace characters that cannot appear in folder names:
        # Windows: < > : " / \ | ? *
        # Unix: /
        # All are replaced with underscores.
        sanitized = re.sub(r'[<>:"/\\|?*]', '_', keyword)
        # Strip leading/trailing whitespace
        sanitized = sanitized.strip()
        # Strip leading/trailing dots (not allowed on Windows)
        sanitized = sanitized.strip('.')
        # If nothing is left after cleaning, use a default name
        if not sanitized:
            sanitized = "unnamed"
        # Cap the length (file systems typically limit names to 255 chars)
        if len(sanitized) > 200:
            sanitized = sanitized[:200]
        return sanitized

    def _get_cache_key(
        self,
        keyword: str,
        content_type: str,
        sort_type: str,
        publish_time: str,
        cursor: str
    ) -> str:
        """
        Build a cache key (a hash of the search parameters).
        Args:
            The search parameters (keyword, content_type, sort_type,
            publish_time, cursor).
        Returns:
            The cache key (an MD5 hex digest).
        """
        # Join all parameters into a single string
        params_str = f"{keyword}|{content_type}|{sort_type}|{publish_time}|{cursor}"
        # Hash it with MD5
        return hashlib.md5(params_str.encode('utf-8')).hexdigest()

    def _get_latest_cache(
        self,
        keyword: str,
        cache_key: str,
        content_type: str,
        sort_type: str,
        publish_time: str
    ) -> Optional[Tuple[str, str]]:
        """
        Find the most recent cache files matching the search parameters.
        Args:
            keyword: The search keyword.
            cache_key: The cache key (unused; kept for interface compatibility).
            content_type: The content type.
            sort_type: The sort order.
            publish_time: The publish-time filter.
        Returns:
            A (raw_filepath, clean_filepath) tuple, or None if no cache exists.
        """
        # Sanitize the keyword for use as a folder name
        safe_keyword = self._sanitize_keyword(keyword)
        base_dir = os.path.join(self.results_base_dir, "xiaohongshu_search", safe_keyword)
        raw_dir = os.path.join(base_dir, "raw")
        clean_dir = os.path.join(base_dir, "clean")
        # Check that both directories exist
        if not os.path.exists(raw_dir) or not os.path.exists(clean_dir):
            return None
        # List all files and keep only those matching the parameters
        try:
            # Build the parameter suffix used to match file names
            param_suffix = self._get_filename_suffix(content_type, sort_type, publish_time)
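            # File names follow {timestamp}_page{page}{param_suffix}.json as
            # written by save_result(), e.g. (hypothetical timestamp):
            #   20240101_120000_page1_不限_综合_不限.json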
            raw_files = list(Path(raw_dir).glob("*.json"))
            clean_files = list(Path(clean_dir).glob("*.json"))
            if not raw_files or not clean_files:
                return None
            # Keep only the files whose names contain the parameter suffix
            matching_raw_files = [
                f for f in raw_files
                if param_suffix in f.name
            ]
            matching_clean_files = [
                f for f in clean_files
                if param_suffix in f.name
            ]
            if not matching_raw_files or not matching_clean_files:
                return None
            # Sort by modification time, newest first
            matching_raw_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
            matching_clean_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
            # Return the newest matching pair of file paths
            return (str(matching_raw_files[0]), str(matching_clean_files[0]))
        except Exception:
            return None

    def _load_cached_result(self, raw_filepath: str) -> Optional[Dict[str, Any]]:
        """
        Load cached raw data.
        Args:
            raw_filepath: Path to the raw data file.
        Returns:
            The raw data dict, or None.
        """
        try:
            with open(raw_filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Handle both the old and the new file format
            if "api_response" in data:
                # New format: contains search_params and api_response
                return data["api_response"]
            else:
                # Old format: the file is the API response itself
                return data
        except Exception:
            return None

    def search(
        self,
        keyword: str,
        content_type: str = "不限",
        sort_type: str = "综合",
        publish_time: str = "不限",
        cursor: str = "",
        timeout: int = 30,
        max_retries: int = 5,
        retry_delay: int = 2,
        force: bool = False
    ) -> Tuple[Dict[str, Any], bool]:
        """
        Search Xiaohongshu notes, with automatic retries and caching.
        Args:
            keyword: The search keyword.
            content_type: Content type; one of 不限, 视频, 图文. Defaults to '不限'.
            sort_type: Sort order; one of 综合, 最新, 最多点赞, 最多评论. Defaults to '综合'.
            publish_time: Publish-time filter; one of 不限, 一天内, 一周内, 半年内. Defaults to '不限'.
            cursor: Pagination cursor. Empty for the first page; the cursor for the
                next page is included in the previous page's response.
            timeout: Request timeout in seconds. Defaults to 30.
            max_retries: Maximum number of attempts. Defaults to 5.
            retry_delay: Retry delay in seconds. Defaults to 2, growing
                exponentially on each retry.
            force: Force a fresh API request, ignoring the cache. Defaults to False.
        Returns:
            A tuple of (raw data, whether it came from the cache).
        Raises:
            requests.exceptions.RequestException: Raised after all retries fail.
        """
        # Check the cache (if enabled and not forcing a refresh)
        if self.use_cache and not force:
            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time, cursor)
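            # Note: cache_key is computed here, but _get_latest_cache currently
            # ignores it; files are matched on the parameter suffix instead.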
            cached_files = self._get_latest_cache(keyword, cache_key, content_type, sort_type, publish_time)
            if cached_files:
                raw_filepath, clean_filepath = cached_files
                cached_result = self._load_cached_result(raw_filepath)
                if cached_result:
                    print(f"✓ Using cached data: {raw_filepath}")
                    return cached_result, True  # flag: served from cache
        payload = {
            "keyword": keyword,
            "content_type": content_type,
            "sort_type": sort_type,
            "publish_time": publish_time,
            "cursor": cursor
        }
        last_exception = None
        for attempt in range(max_retries):
            try:
                if attempt > 0:
                    # Exponential backoff: the delay doubles on each retry
                    wait_time = retry_delay * (2 ** (attempt - 1))
                    print(f"Waiting {wait_time}s before retry {attempt + 1}...")
                    time.sleep(wait_time)
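                # With the defaults (retry_delay=2, max_retries=5), attempts
                # 2 through 5 are preceded by waits of 2, 4, 8, and 16 seconds.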
                print(f"Searching keyword: {keyword} (attempt {attempt + 1}/{max_retries})")
                response = requests.post(
                    self.api_url,
                    json=payload,
                    timeout=timeout,
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                raw_result = response.json()
                # If the result field is a string, parse it into a JSON object
                if 'result' in raw_result and isinstance(raw_result['result'], str):
                    try:
                        raw_result['result'] = json.loads(raw_result['result'])
                    except json.JSONDecodeError:
                        pass  # keep the string as-is if parsing fails
                # raw_result is the raw data (result parsed, full structure preserved)
                print("✓ Search succeeded!")
                return raw_result, False  # flag: fresh data
            except requests.exceptions.Timeout as e:
                last_exception = e
                print(f"✗ Request timed out: {e}")
            except requests.exceptions.ConnectionError as e:
                last_exception = e
                print(f"✗ Connection error: {e}")
            except requests.exceptions.HTTPError as e:
                last_exception = e
                status_code = e.response.status_code if e.response else "unknown"
                print(f"✗ HTTP error {status_code}: {e}")
                # Do not retry on client errors (4xx)
                if e.response and 400 <= e.response.status_code < 500:
                    print("Client error, stopping retries")
                    raise
            except requests.exceptions.RequestException as e:
                last_exception = e
                print(f"✗ Request failed: {e}")
        # All retries failed
        print(f"✗ Reached the maximum number of retries ({max_retries}); request failed")
        raise last_exception

    def _extract_clean_data(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract and clean the data into a flattened structure.
        Args:
            result: The processed result dict.
        Returns:
            A dict with the list of notes and pagination info.
        """
        result_data = result.get("result", {})
        if not isinstance(result_data, dict):
            return {"has_more": False, "next_cursor": "", "notes": []}
        data = result_data.get("data", {})
        notes = data.get("data", [])
        clean_notes = []
        for note in notes:
            note_card = note.get("note_card", {})
            user = note_card.get("user", {})
            interact_info = note_card.get("interact_info", {})
            # Handle image_list: extract the URLs from the dict entries
            image_list_raw = note_card.get("image_list", [])
            images = []
            for img in image_list_raw:
                if isinstance(img, dict) and "image_url" in img:
                    images.append(img["image_url"])
                elif isinstance(img, str):
                    images.append(img)
            # Missing fields are uniformly represented as None/null
            note_id = note.get("id")
            clean_note = {
                "channel_content_id": note_id or None,
                "link": f"https://www.xiaohongshu.com/explore/{note_id}" if note_id else None,
                "comment_count": interact_info.get("comment_count"),
                "images": images if images else [],
                "like_count": interact_info.get("liked_count"),
                "desc": note_card.get("desc") or None,  # summary (returned by the search API)
                "body_text": None,  # the full body requires the note-detail API
                "title": note_card.get("display_title") or None,
                "collect_count": interact_info.get("collected_count"),
                "channel_account_id": user.get("user_id") or None,
                "channel_account_name": user.get("nick_name") or None,
                "content_type": note_card.get("type") or None,
                "video": None,  # search results carry no video field
                "shared_count": interact_info.get("shared_count")
            }
            clean_notes.append(clean_note)
        # Return the clean data with pagination info
        return {
            "has_more": data.get("has_more", False),
            "next_cursor": data.get("next_cursor", ""),
            "notes": clean_notes
        }

    def _get_filename_suffix(
        self,
        content_type: str,
        sort_type: str,
        publish_time: str
    ) -> str:
        """
        Build a file-name suffix from the search parameters.
        Args:
            content_type: The content type.
            sort_type: The sort order.
            publish_time: The publish-time filter.
        Returns:
            The file-name suffix string.
        """
        # Use the raw parameter values directly, with no mapping; all are included
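        # e.g. ("不限", "综合", "不限") -> "_不限_综合_不限"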
        parts = [content_type, sort_type, publish_time]
        return "_" + "_".join(parts)

    def save_result(
        self,
        keyword: str,
        raw_result: Dict[str, Any],
        page: int = 1,
        content_type: str = "不限",
        sort_type: str = "综合",
        publish_time: str = "不限",
        cursor: str = ""
    ) -> Tuple[str, str]:
        """
        Save the raw data and the cleaned data to separate directories.
        Directory layout:
            data/search/xiaohongshu_search/
            ├── {keyword}/
            │   ├── raw/    # raw data (full API response, incl. pagination info)
            │   │   └── {timestamp}_page{page}_{params}.json
            │   └── clean/  # cleaned data (flattened note array)
            │       └── {timestamp}_page{page}_{params}.json
        Args:
            keyword: The search keyword.
            raw_result: The raw data (with the result field already parsed).
            page: The page number.
            content_type: The content type.
            sort_type: The sort order.
            publish_time: The publish-time filter.
            cursor: The pagination cursor.
        Returns:
            A tuple of (raw data path, clean data path).
        """
        # Sanitize the keyword for use as a folder name
        safe_keyword = self._sanitize_keyword(keyword)
        # Create the directory structure
        base_dir = os.path.join(self.results_base_dir, "xiaohongshu_search", safe_keyword)
        raw_dir = os.path.join(base_dir, "raw")
        clean_dir = os.path.join(base_dir, "clean")
        os.makedirs(raw_dir, exist_ok=True)
        os.makedirs(clean_dir, exist_ok=True)
        # Build the file name (including the parameter info)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        param_suffix = self._get_filename_suffix(content_type, sort_type, publish_time)
        filename = f"{timestamp}_page{page}{param_suffix}.json"
        raw_filepath = os.path.join(raw_dir, filename)
        clean_filepath = os.path.join(clean_dir, filename)
        # Attach the search parameters to the raw data
        raw_data_with_meta = {
            "search_params": {
                "keyword": keyword,
                "content_type": content_type,
                "sort_type": sort_type,
                "publish_time": publish_time,
                "cursor": cursor,
                "page": page,
                "timestamp": timestamp
            },
            "api_response": raw_result
        }
        # Save the raw result (with metadata)
        with open(raw_filepath, 'w', encoding='utf-8') as f:
            json.dump(raw_data_with_meta, f, ensure_ascii=False, indent=2)
        # Extract and save the cleaned data
        clean_data = self._extract_clean_data(raw_result)
        # Attach the search parameters to the clean data
        clean_data_with_meta = {
            "search_params": {
                "keyword": keyword,
                "content_type": content_type,
                "sort_type": sort_type,
                "publish_time": publish_time,
                "cursor": cursor,
                "page": page,
                "timestamp": timestamp
            },
            "has_more": clean_data["has_more"],
            "next_cursor": clean_data["next_cursor"],
            "notes": clean_data["notes"]
        }
        with open(clean_filepath, 'w', encoding='utf-8') as f:
            json.dump(clean_data_with_meta, f, ensure_ascii=False, indent=2)
        return raw_filepath, clean_filepath


def main():
    """Example usage as a CLI."""
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Xiaohongshu note search tool')
    parser.add_argument(
        '--results-dir',
        type=str,
        default=None,
        help='Results output directory (default: read from config)'
    )
    parser.add_argument(
        '--keyword',
        type=str,
        required=True,
        help='Search keyword (required)'
    )
    parser.add_argument(
        '--content-type',
        type=str,
        default='不限',
        choices=['不限', '视频', '图文'],
        help='Content type (default: 不限)'
    )
    parser.add_argument(
        '--sort-type',
        type=str,
        default='综合',
        choices=['综合', '最新', '最多点赞', '最多评论'],
        help='Sort order (default: 综合)'
    )
    parser.add_argument(
        '--publish-time',
        type=str,
        default='不限',
        choices=['不限', '一天内', '一周内', '半年内'],
        help='Publish-time filter (default: 不限)'
    )
    parser.add_argument(
        '--cursor',
        type=str,
        default='',
        help='Pagination cursor (defaults to empty, i.e. the first page)'
    )
    parser.add_argument(
        '--page',
        type=int,
        default=1,
        help='Page label used in saved file names (default: 1)'
    )
    parser.add_argument(
        '--max-retries',
        type=int,
        default=5,
        help='Maximum number of retries (default: 5)'
    )
    parser.add_argument(
        '--retry-delay',
        type=int,
        default=2,
        help='Retry delay in seconds (default: 2)'
    )
    parser.add_argument(
        '--timeout',
        type=int,
        default=30,
        help='Request timeout in seconds (default: 30)'
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help='Force a fresh API request, ignoring the cache'
    )
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching'
    )
    args = parser.parse_args()
    # Create the API client
    use_cache = not args.no_cache
    client = XiaohongshuSearch(results_dir=args.results_dir, use_cache=use_cache)
    # Run the search and save the results
    try:
        raw_result, from_cache = client.search(
            args.keyword,
            args.content_type,
            args.sort_type,
            args.publish_time,
            args.cursor,
            timeout=args.timeout,
            max_retries=args.max_retries,
            retry_delay=args.retry_delay,
            force=args.force
        )
        # Only freshly requested data is saved
        if not from_cache:
            raw_filepath, clean_filepath = client.save_result(
                args.keyword,
                raw_result,
                args.page,
                args.content_type,
                args.sort_type,
                args.publish_time,
                args.cursor
            )
            print(f"Raw data saved to: {raw_filepath}")
            print(f"Clean data saved to: {clean_filepath}")
        else:
            print("Used cached data, no new files saved")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        # Exit non-zero so callers can detect the failure
        sys.exit(1)


def search_xiaohongshu(
    keyword: str,
    content_type: str = "不限",
    sort_type: str = "综合",
    publish_time: str = "不限",
    page: int = 1,
    force: bool = False
) -> Dict[str, Any]:
    """
    Search Xiaohongshu notes.
    Args:
        keyword: The search keyword.
        content_type: Content type; one of 不限, 视频, 图文.
        sort_type: Sort order; one of 综合, 最新, 最多点赞, 最多评论.
        publish_time: Publish time; one of 不限, 一天内, 一周内, 半年内.
        page: Page number (pagination is handled automatically).
        force: Force a refresh (ignore the cache).
    Returns:
        {
            "search_params": {...},
            "has_more": bool,
            "next_cursor": str,
            "notes": [...]
        }
    Examples:
        >>> # Basic usage
        >>> data = search_xiaohongshu("产品测试")
        >>> for note in data['notes']:
        ...     print(f"{note['title']} - {note['like_count']} likes")
        >>> # With explicit parameters
        >>> data = search_xiaohongshu(
        ...     keyword="产品测试",
        ...     content_type="视频",
        ...     sort_type="最新"
        ... )
        >>> # Pagination (the cursor is handled automatically)
        >>> page1 = search_xiaohongshu("产品测试", page=1)
        >>> page2 = search_xiaohongshu("产品测试", page=2)
        >>> page3 = search_xiaohongshu("产品测试", page=3)
    """
    # Create a client (with the default configuration)
    client = XiaohongshuSearch(use_cache=True)
    # Resolve the pagination cursor automatically
    cursor = ""
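    # Note: requesting page N first resolves pages 1..N-1 recursively to
    # recover their cursors; with caching enabled those lookups are served
    # from disk instead of hitting the API again.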
    if page > 1:
        # Read the previous page's cursor
        prev_page_result = search_xiaohongshu(
            keyword=keyword,
            content_type=content_type,
            sort_type=sort_type,
            publish_time=publish_time,
            page=page - 1,
            force=False  # serve the previous page from the cache
        )
        cursor = prev_page_result.get('next_cursor', '')
    # Search (retries, timeouts, etc. are handled internally)
    raw_result, from_cache = client.search(
        keyword=keyword,
        content_type=content_type,
        sort_type=sort_type,
        publish_time=publish_time,
        cursor=cursor,
        force=force
    )
    # Only freshly requested data needs to be saved
    if not from_cache:
        _, clean_filepath = client.save_result(
            keyword=keyword,
            raw_result=raw_result,
            page=page,
            content_type=content_type,
            sort_type=sort_type,
            publish_time=publish_time,
            cursor=cursor
        )
        # Read the saved clean data back and return it
        with open(clean_filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    else:
        # For cached data, extract the clean data directly and return it
        clean_data = client._extract_clean_data(raw_result)
        # Attach the search parameters to the clean data
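        # Caveat: _load_cached_result returns the cached api_response itself,
        # which carries no search_params, so this timestamp is normally empty
        # for cached results.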
        timestamp = raw_result.get("search_params", {}).get("timestamp", "")
        clean_data_with_meta = {
            "search_params": {
                "keyword": keyword,
                "content_type": content_type,
                "sort_type": sort_type,
                "publish_time": publish_time,
                "cursor": cursor,
                "page": page,
                "timestamp": timestamp
            },
            "has_more": clean_data["has_more"],
            "next_cursor": clean_data["next_cursor"],
            "notes": clean_data["notes"]
        }
        return clean_data_with_meta


if __name__ == "__main__":
    main()