data_main.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # -*- coding: utf-8 -*-
  2. from common import Material, Feishu
  3. from extract_data.douyin.douyin_author import douyinAuthor
  4. from extract_data.kuaishou.kuaishou_author import kuaishouAuthor
  5. import schedule
  6. import time
  7. import concurrent.futures
  8. import threading
  9. import os
  10. # 控制读写速度的参数
  11. MAX_BPS = 10 * 1024 * 1024 # 120MB/s
  12. MAX_WORKERS = os.cpu_count() * 2 # 线程池最大工作线程数量
  13. READ_WRITE_CHUNK_SIZE = 1024 * 1024 # 每次读写的块大小 (1MB)
  14. SLEEP_INTERVAL = READ_WRITE_CHUNK_SIZE / MAX_BPS # 控制每次读写的延迟时间
  15. # 全局锁,用于同步读写操作
  16. lock = threading.Lock()
  17. # 定义读取表格的函数
  18. def douyin_start(user_data):
  19. print(f"执行抖音数据抓取{user_data}")
  20. douyinAuthor.get_videoList(user_data)
  21. def kuaishou_start(user_data):
  22. print(f"执行快手数据抓取{user_data}")
  23. kuaishouAuthor.get_kuaishou_videoList(user_data)
  24. # 定义定时任务
  25. def douyin_task():
  26. data = Material.get_all_user("douyin")
  27. # 创建一个线程池
  28. with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
  29. with lock:
  30. start_time = time.time()
  31. time.sleep(SLEEP_INTERVAL)
  32. end_time = time.time()
  33. elapsed_time = end_time - start_time
  34. if elapsed_time < SLEEP_INTERVAL:
  35. time.sleep(SLEEP_INTERVAL - elapsed_time)
  36. futures = {executor.submit(douyin_start, user_data): user_data for user_data in data}
  37. # 等待所有任务执行完成
  38. for future in concurrent.futures.as_completed(futures):
  39. # 获取每个任务的执行结果
  40. result = future.result()
  41. print("处理结果:", result)
  42. print("抖音数据抓取定时任务执行完成")
  43. # 定义定时任务
  44. def kuanshou_task():
  45. data = Material.get_all_user("kuaishou")
  46. # 创建一个线程池
  47. with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
  48. with lock:
  49. start_time = time.time()
  50. time.sleep(SLEEP_INTERVAL)
  51. end_time = time.time()
  52. elapsed_time = end_time - start_time
  53. if elapsed_time < SLEEP_INTERVAL:
  54. time.sleep(SLEEP_INTERVAL - elapsed_time)
  55. futures = {executor.submit(kuaishou_start, user_data): user_data for user_data in data}
  56. # 等待所有任务执行完成
  57. for future in concurrent.futures.as_completed(futures):
  58. # 获取每个任务的执行结果
  59. result = future.result()
  60. print("处理结果:", result)
  61. print("快手数据抓取定时任务执行完成.")
  62. schedule.every(8).hours.do(douyin_task)
  63. schedule.every(8).hours.do(kuanshou_task)
  64. douyin_task()
  65. kuanshou_task()
  66. # 持续运行,直到手动终止
  67. while True:
  68. schedule.run_pending()
  69. time.sleep(1)