# data_main.py
# Entry script: runs the Douyin and Kuaishou crawl tasks once at start-up and
# then re-runs each of them every 8 hours via the `schedule` library.
  1. from common import Material, Feishu
  2. from extract_data.douyin.douyin_author import douyinAuthor
  3. from extract_data.kuaishou.kuaishou_author import kuaishouAuthor
  4. import schedule
  5. import time
  6. import concurrent.futures
  7. import threading
  8. import os
  9. # 控制读写速度的参数
  10. MAX_BPS = 10 * 1024 * 1024 # 120MB/s
  11. MAX_WORKERS = os.cpu_count() * 2 # 线程池最大工作线程数量
  12. READ_WRITE_CHUNK_SIZE = 1024 * 1024 # 每次读写的块大小 (1MB)
  13. SLEEP_INTERVAL = READ_WRITE_CHUNK_SIZE / MAX_BPS # 控制每次读写的延迟时间
  14. # 全局锁,用于同步读写操作
  15. lock = threading.Lock()
  16. # 定义读取表格的函数
  17. def douyin_start(user_data):
  18. print(f"执行抖音数据抓取{user_data}")
  19. douyinAuthor.get_videoList(user_data)
  20. def kuaishou_start(user_data):
  21. print(f"执行快手数据抓取{user_data}")
  22. kuaishouAuthor.get_kuaishou_videoList(user_data)
  23. # 定义定时任务
  24. def douyin_task():
  25. data = Material.get_all_user("douyin")
  26. # 创建一个线程池
  27. with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
  28. with lock:
  29. start_time = time.time()
  30. time.sleep(SLEEP_INTERVAL)
  31. end_time = time.time()
  32. elapsed_time = end_time - start_time
  33. if elapsed_time < SLEEP_INTERVAL:
  34. time.sleep(SLEEP_INTERVAL - elapsed_time)
  35. futures = {executor.submit(douyin_start, user_data): user_data for user_data in data}
  36. # 等待所有任务执行完成
  37. for future in concurrent.futures.as_completed(futures):
  38. # 获取每个任务的执行结果
  39. result = future.result()
  40. print("处理结果:", result)
  41. print("抖音数据抓取定时任务执行完成")
  42. # 定义定时任务
  43. def kuanshou_task():
  44. data = Material.get_all_user("kuaishou")
  45. # 创建一个线程池
  46. with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
  47. with lock:
  48. start_time = time.time()
  49. time.sleep(SLEEP_INTERVAL)
  50. end_time = time.time()
  51. elapsed_time = end_time - start_time
  52. if elapsed_time < SLEEP_INTERVAL:
  53. time.sleep(SLEEP_INTERVAL - elapsed_time)
  54. futures = {executor.submit(kuaishou_start, user_data): user_data for user_data in data}
  55. # 等待所有任务执行完成
  56. for future in concurrent.futures.as_completed(futures):
  57. # 获取每个任务的执行结果
  58. result = future.result()
  59. print("处理结果:", result)
  60. print("快手数据抓取定时任务执行完成.")
  61. schedule.every(8).hours.do(douyin_task)
  62. schedule.every(8).hours.do(kuanshou_task)
  63. douyin_task()
  64. kuanshou_task()
  65. # 持续运行,直到手动终止
  66. while True:
  67. schedule.run_pending()
  68. time.sleep(1)