@@ -122,7 +122,8 @@ def getMediaInfo():
     for task_info in result:
         media_id = task_info['media_id']

-        media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id},verify=False).json()['content']
+        media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id}, verify=False).json()[
+            'content']
         media_name = media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info['nickName']
         nick_name = task_info['nick_name']
         spider_link = task_info['spider_link']
@@ -222,7 +223,7 @@ def get_repeat_list():
         if result:
             repeat_list.append(spider_link)
     if repeat_list:
-        return jsonify({'code': 400, 'message': '名单重复', 'repeat_list': repeat_list})
+        return jsonify({'code': 200, 'message': '抓取名单校验通过', 'repeat_list': repeat_list})
     else:

         return jsonify({'code': 200, 'message': '抓取名单校验通过', 'repeat_list': repeat_list})
@@ -487,85 +488,115 @@ def create_uid(task, task_id, spider_link):
         content_tag_list.append(tag['tagName'])
     user_tags = ','.join(str(i) for i in tag_name_list)
     user_content_tags = ','.join(str(i) for i in content_tag_list)
-
+    repeat_list = []
+    create_user_list = []
     for author_url in spider_link:
-        now_time = int(time.time())
-        time_array = time.localtime(now_time)
-        str_time = time.strftime("%Y%m%d", time_array)
-        # build the tags for the user to be created
-
-        tags = ''
-        if task['task_type'] == 'author':
-            spider_task = '账号'
-            tags_list = ['spider', spider_task, spider_platform, user_tags, task['content_category_str'], str_time]
-
-        elif task['task_type'] == 'search':
-            spider_task = '搜索'
-            tags_list = ['spider', spider_task, spider_platform, user_tags, author_url, task['content_category_str'], str_time]
-        elif task['task_type'] == 'board':
-            spider_task = '榜单'
-            mode_tags = task['mode_board_str']
-
-            tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'], str_time]
-        elif task['task_type'] == 'recommend':
-            spider_task = '推荐'
-            mode_tags = task['mode_name_str'] + task['mode_board_str']
-            tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'], str_time]
+        repeat_sql = f'select * from crawler_author_map where spider_link="{author_url}"'
+        result = mysql_con.get_values(repeat_sql)
+        if result:
+            old_task_id = result[0]['task_id']
+            is_del = result[0]['is_del']
+            if task_id == old_task_id:
+                if is_del:
+                    continue
+                else:
+                    up_sql = f'update crawler_author_map set is_del=1 where spider_link="{author_url}"'
+                    mysql_con.update_values(up_sql)
+            else:
+                if is_del:
+                    repeat_list.append(author_url)
+                else:
+                    up_sql = f'update crawler_author_map set task_id={task_id},is_del=1 where spider_link="{author_url}"'
+                    mysql_con.update_values(up_sql)
         else:
-            tags_list = ['spider', spider_platform, user_tags, task['content_category_str'], str_time]
-        for v in tags_list:
-            if v:
-                tags += str(v) + ','
-        post_data = {
-            # 'count': 1,  # (required) number of accounts: pass 1
-            # 'accountType': 4,  # (required) account type: pass 4 for an app virtual account
-            'pwd': '',  # password, default 12346
-            'nickName': '',  # nickname, default vuser......
-            'avatarUrl': '',
-            # avatar URL, default http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
-            'tagName': tags[:-1],  # multiple values are joined with commas
-        }
-        try:
-            response = requests.post(url=conf['media_url'], params=post_data)
-            media_id = response.json()['data']
-            media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id},verify=False).json()['content']
-        except Exception as e:
-            logging.warning(f'创建账户:{spider_link},失败,原因:{e}')
-            fail_list.append(author_url)
-            continue
+            create_user_list.append(author_url)
+    if repeat_list:
+        message = f'该任务和其他任务抓取名单重复:{repeat_list}'
+        return jsonify({'code': 400, 'message': message})
+    else:
+        for author_url in create_user_list:
+
+            now_time = int(time.time())
+            time_array = time.localtime(now_time)
+            str_time = time.strftime("%Y%m%d", time_array)
+            # build the tags for the user to be created
+            tags = ''
+            if task['task_type'] == 'author':
+                spider_task = '账号'
+                tags_list = ['spider', spider_task, spider_platform, user_tags, task['content_category_str'], str_time]
+
+            elif task['task_type'] == 'search':
+                spider_task = '搜索'
+                tags_list = ['spider', spider_task, spider_platform, user_tags, author_url,
+                             task['content_category_str'], str_time]
+            elif task['task_type'] == 'board':
+                spider_task = '榜单'
+                mode_tags = task['mode_board_str']
+
+                tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'],
+                             str_time]
+            elif task['task_type'] == 'recommend':
+                spider_task = '推荐'
+                mode_tags = task['mode_name_str'] + task['mode_board_str']
+                tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'],
+                             str_time]
+            else:
+                tags_list = ['spider', spider_platform, user_tags, task['content_category_str'], str_time]
+            for v in tags_list:
+                if v:
+                    tags += str(v) + ','
+            post_data = {
+                # 'count': 1,  # (required) number of accounts: pass 1
+                # 'accountType': 4,  # (required) account type: pass 4 for an app virtual account
+                'pwd': '',  # password, default 12346
+                'nickName': '',  # nickname, default vuser......
+                'avatarUrl': '',
+                # avatar URL, default http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
+                'tagName': tags[:-1],  # multiple values are joined with commas
+            }
+            try:
+                response = requests.post(url=conf['media_url'], params=post_data)
+                media_id = response.json()['data']
+                media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id}, verify=False).json()[
+                    'content']
+            except Exception as e:
+                logging.warning(f'创建账户:{spider_link},失败,原因:{e}')
+                fail_list.append(author_url)
+                continue

-        data = dict(
-            spider_link=author_url,
-            media_id=media_id,
-            media_name=media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info['nickName'],
-            source=source,
-            task_type=task_type,
-            applets_status=applets_status,
-            app_status=app_status,
-            user_tag=user_tags,
-            user_content_tag=user_content_tags,
-            insert_time=int(time.time()),
-            update_time=int(time.time()),
-            create_user_time=now_time,
-            mode_name_str=task['mode_name_str'],
-            mode_board_str=task['mode_board_str'],
-            content_category_str=task['content_category_str'],
-            # mode_value_str=mode_value_str,
-            task_id=task_id,
-            media_main_url=conf['media_main_url'].format(media_id)
-        )
-        keys = ','.join(data.keys())
-        values = ','.join(['%s'] * len(data))
-        table = 'crawler_author_map'
-        sql = f"""insert into {table}({keys}) VALUES({values})"""
-        mysql_con.insert_values(sql, tuple(data.values()))
-        uer_info = dict(
-            outer_id=author_url,
-            uid=media_id
-        )
-        success_list.append(uer_info)
+            data = dict(
+                spider_link=author_url,
+                media_id=media_id,
+                media_name=media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info[
+                    'nickName'],
+                source=source,
+                task_type=task_type,
+                applets_status=applets_status,
+                app_status=app_status,
+                user_tag=user_tags,
+                user_content_tag=user_content_tags,
+                insert_time=int(time.time()),
+                update_time=int(time.time()),
+                create_user_time=now_time,
+                mode_name_str=task['mode_name_str'],
+                mode_board_str=task['mode_board_str'],
+                content_category_str=task['content_category_str'],
+                # mode_value_str=mode_value_str,
+                task_id=task_id,
+                media_main_url=conf['media_main_url'].format(media_id)
+            )
+            keys = ','.join(data.keys())
+            values = ','.join(['%s'] * len(data))
+            table = 'crawler_author_map'
+            sql = f"""insert into {table}({keys}) VALUES({values})"""
+            mysql_con.insert_values(sql, tuple(data.values()))
+            uer_info = dict(
+                outer_id=author_url,
+                uid=media_id
+            )
+            success_list.append(uer_info)

-    return success_list, fail_list
+        return success_list, fail_list


@app.route("/v1/crawler/author/create", methods=["POST"])
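
Review note: the new WHERE clauses interpolate author_url and task_id into SQL via f-strings, which leaves the lookup and update open to SQL injection. A minimal parameterized sketch, assuming mysql_con wraps a PyMySQL/MySQLdb-style cursor and that get_values/update_values can forward a params tuple to cursor.execute (that signature is an assumption about this repo's helper, not a confirmed API):

    # hedged sketch: bind values instead of interpolating them
    repeat_sql = 'select * from crawler_author_map where spider_link=%s'
    result = mysql_con.get_values(repeat_sql, (author_url,))  # assumes an optional params argument

    up_sql = 'update crawler_author_map set task_id=%s, is_del=1 where spider_link=%s'
    mysql_con.update_values(up_sql, (task_id, author_url))  # assumes an optional params argument

If the helpers only accept a bare SQL string, the same %s binding can be applied one level down, where cursor.execute is called.

Similarly, the repeated requests.get(...).json()['content'] calls run with verify=False and no timeout, so a hung media service stalls the worker. A small wrapper sketch (the helper name fetch_media_info and the 10-second timeout are illustrative, not from this repo):

    # hedged sketch: one place for timeout and HTTP error handling
    def fetch_media_info(media_id):
        resp = requests.get(url=conf['select_media_url'], params={'uid': media_id},
                            verify=False, timeout=10)
        resp.raise_for_status()  # surface HTTP errors instead of a KeyError on 'content'
        return resp.json()['content']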
|