conf_task.py

import ast
import os
import sys
import time

import requests
from flask import Flask, request, jsonify

# Make the project root importable before pulling in project-local modules.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from common.db.mysql_help import MysqlHelper
from user_spider.user_info import *

app = Flask(__name__)
# Return JSON without escaping non-ASCII (Chinese) characters.
app.config['JSON_AS_ASCII'] = False
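
# Assumed database interface: the handlers below rely on MysqlHelper exposing
# get_values(sql), insert_values(sql, params) and update_values(sql). This is
# inferred from the calls in this file, not from the helper's own documentation.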

# Only accepts GET requests.
@app.route("/v1/crawler/source/getall", methods=["GET"])
def getSource():
    try:
        # Incoming query params are read but currently unused.
        get_data = request.args.to_dict()
        sql = 'select source, task_type, spider_name, machine, source_desc, task_type_desc, spider_name_desc from crawler_source'
        result = MysqlHelper.get_values(sql)
        if not result:
            return jsonify({'code': '200', 'result': [], 'message': '没有更多数据'})
        source_list = list()
        for source, task_type, spider_name, machine, source_desc, task_type_desc, spider_name_desc in result:
            source_dict = {
                'task_type': [
                    {
                        'type': task_type,
                        'description': task_type_desc,
                        'spider': {
                            'spider_name': spider_name,
                            'description': spider_name_desc
                        }
                    }
                ],
                'description': source_desc,
                'source': source,
                'machine': machine
            }
            source_list.append(source_dict)
    except Exception as e:
        return jsonify({'code': '400', 'message': '获取数据源信息失败'})
    return jsonify({'code': '200', 'result': source_list})
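
# Illustrative call (host/port assumed from app.run() at the bottom of this file):
#   GET http://127.0.0.1:5050/v1/crawler/source/getall
#   -> {"code": "200", "result": [{"source": ..., "machine": ..., "description": ..., "task_type": [...]}]}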


@app.route("/v1/crawler/task/insert", methods=["POST"])
def insertTask():
    try:
        data = request.json
        outer_info = data.get('spider_link')
        source = data.get('source')
        # Reject spider links that already exist for this source.
        exist_outer_info = list()
        s_sql = f"""select spider_link from crawler_task where source='{source}'"""
        result = MysqlHelper.get_values(s_sql)
        if result:
            for outer_link in outer_info:
                for link_tuple in result:
                    if outer_link in link_tuple[0]:
                        exist_outer_info.append(outer_link)
        if exist_outer_info:
            return jsonify({'code': 200, 'message': '名单重复', 'repeat_list': exist_outer_info})
        # Field conversion: flatten tag objects to comma-separated tag names.
        tag_name_list = []
        content_tag_list = []
        user_tag = data['user_tag']
        user_content_tag = data['user_content_tag']
        for tag in user_tag:
            tag_name_list.append(tag['tagName'])
        for tag in user_content_tag:
            content_tag_list.append(tag['tagName'])
        # Timestamps arrive in milliseconds; store them in seconds.
        if data['min_publish_time']:
            data['min_publish_time'] = int(data['min_publish_time'] / 1000)
        else:
            data['min_publish_time'] = 0
        if data['min_publish_day']:
            data['min_publish_day'] = 0
        data['next_time'] = int(data['next_time'] / 1000)
        data['insert_time'] = int(time.time())
        data['update_time'] = int(time.time())
        data['spider_link'] = str(data['spider_link'])
        data['spider_rule'] = str(data['spider_rule'])
        data['user_tag'] = ','.join(str(i) for i in tag_name_list)
        data['user_content_tag'] = ','.join(str(i) for i in content_tag_list)
        # data['crawler_interval'] = data.pop('interval')
        # Build the column list and a matching set of %s placeholders.
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into {table}({keys}) VALUES({values})'.format(table='crawler_task', keys=keys, values=values)
        MysqlHelper.insert_values(sql, tuple(data.values()))
    except Exception as e:
        return jsonify({'code': 400, 'message': '任务写入失败'})
    return jsonify({'code': 200, 'message': 'task create success'})
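
# Illustrative payload (field names taken from the handler above; values are made up):
#   POST /v1/crawler/task/insert
#   {"source": "xigua", "spider_link": ["https://..."], "spider_rule": {...},
#    "user_tag": [{"tagName": "tag1"}], "user_content_tag": [{"tagName": "tag2"}],
#    "min_publish_time": 1650000000000, "min_publish_day": 0, "next_time": 1650000000000, ...}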


@app.route("/v1/crawler/task/gettask", methods=["GET"])
def getAllTask():
    try:
        get_data = request.args.to_dict()
        page = int(get_data.get('page', 1))
        offset = int(get_data.get('offset', 10))
        # MySQL LIMIT takes (start_row, row_count); `offset` is the page size here.
        start_count = (page * offset) - offset
        sql = f"""select task_id, task_name from crawler_task limit {start_count}, {offset}"""
        result = MysqlHelper.get_values(sql)
        if not result:
            return jsonify({'code': '200', 'result': [], 'message': '没有更多任务'})
        source_list = list()
        for task_id, task_name in result:
            data = dict(
                task_id=task_id,
                task_name=task_name,
            )
            source_list.append(data)
        # Total row count for pagination.
        t_sql = f"""select count(*) from crawler_task"""
        t_res = MysqlHelper.get_values(t_sql)
        total = t_res[0][0]
    except Exception as e:
        return jsonify({"code": "400", 'message': "任务列表获取失败"})
    return jsonify({'code': '200', 'result': source_list, 'total': total})
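
# Illustrative call: GET /v1/crawler/task/gettask?page=1&offset=10
#   -> {"code": "200", "result": [{"task_id": ..., "task_name": ...}], "total": ...}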


@app.route("/v1/crawler/task/getone", methods=["GET"])
def getOneTask():
    try:
        get_data = request.args.to_dict()
        task_id = get_data['task_id']
        sql = f'select task_id, spider_link from crawler_task where task_id={task_id}'
        result = MysqlHelper.get_values(sql)
        if not result:
            return jsonify({'code': '200', 'result': [], 'message': 'no data'})
        for task_id, spider_link in result:
            data = dict(
                task_id=task_id,
                # spider_link was stored via str(list); parse it back safely.
                spider_link=ast.literal_eval(spider_link),
            )
    except Exception as e:
        return jsonify({'code': '400', "message": "获取任务信息失败"})
    return jsonify({'code': '200', 'result': data})
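
# Illustrative call: GET /v1/crawler/task/getone?task_id=1
#   -> {"code": "200", "result": {"task_id": 1, "spider_link": [...]}}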


@app.route("/v1/crawler/task/update", methods=["POST"])
def updateTask():
    try:
        task_id = request.json.get('task_id')
        spider_link = request.json.get('spider_link')
        sql = f"""UPDATE crawler_task SET spider_link="{spider_link}" where task_id = {task_id}"""
        result = MysqlHelper.update_values(sql)
        if result:
            return jsonify({'code': 200, 'message': 'task update success'})
        else:
            return jsonify({'code': 400, 'message': 'task update failed'})
    except Exception as e:
        return jsonify({'code': 400, 'message': '任务更新失败'})
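
# Illustrative payload (values are made up): POST /v1/crawler/task/update
#   {"task_id": 1, "spider_link": ["https://..."]}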


def get_user_info(source):
    # Maps a source name to its user-info spider (imported from user_spider.user_info).
    source_spider = {
        'xigua': xigua_user_info
    }
    return source_spider.get(source)


@app.route("/v1/crawler/author/create", methods=["POST"])
def createUser():
    get_media_url = 'http://videotest-internal.yishihui.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
    spider_link = request.json.get('spider_link')
    source = request.json.get('source')
    task_type = request.json.get('task_type')
    applets_status = request.json.get('applets_status')
    app_status = request.json.get('app_status')
    user_tag = request.json.get('user_tag')
    user_content_tag = request.json.get('user_content_tag')
    success_list = list()
    fail_list = list()
    for author_url in spider_link:
        try:
            post_data = {
                # 'count': 1,  # (required) number of accounts: pass 1
                # 'accountType': 4,  # (required) account type: pass 4 for an app virtual account
                'pwd': '',  # password, defaults to 12346
                'nickName': '',  # nickname, defaults to vuser......
                'avatarUrl': '',
                # avatar URL, defaults to http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
                'tagName': user_tag,  # multiple values are separated by commas
            }
            # Register a virtual media account and keep the returned id.
            response = requests.post(url=get_media_url, params=post_data)
            media_id = response.json()['data']
            # Skip links that are already mapped to an author.
            f_sql = f"""select spider_link from crawler_author_map where spider_link="{author_url}" """
            result = MysqlHelper.get_values(f_sql)
            if result:
                success_list.append(author_url)
                continue
            else:
                tag_name_list = []
                content_tag_list = []
                for tag in user_tag:
                    tag_name_list.append(tag['tagName'])
                for tag in user_content_tag:
                    content_tag_list.append(tag['tagName'])
                data = dict(
                    spider_link=author_url,
                    media_id=media_id,
                    source=source,
                    task_type=task_type,
                    applets_status=applets_status,
                    app_status=app_status,
                    user_tag=','.join(str(i) for i in tag_name_list),
                    user_content_tag=','.join(str(i) for i in content_tag_list),
                    insert_time=int(time.time()),
                    update_time=int(time.time())
                )
                keys = ','.join(data.keys())
                values = ','.join(['%s'] * len(data))
                table = 'crawler_author_map'
                sql = f"""insert into {table}({keys}) VALUES({values})"""
                result = MysqlHelper.insert_values(sql, tuple(data.values()))
                if not result:
                    fail_list.append(author_url)
                else:
                    success_list.append(author_url)
        except Exception as e:
            fail_list.append(author_url)
            continue
    return jsonify({'code': 200, 'result': {'success': success_list, 'fail': fail_list}})
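
# Illustrative payload (field names taken from the handler above; values are made up):
#   POST /v1/crawler/author/create
#   {"source": "xigua", "task_type": ..., "applets_status": ..., "app_status": ...,
#    "spider_link": ["https://..."], "user_tag": [{"tagName": "tag1"}],
#    "user_content_tag": [{"tagName": "tag2"}]}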

if __name__ == "__main__":
    app.run(debug=True, port=5050)