Explorar o código

update 创建任务时对名单进行校验

lierqiang hai 2 anos
pai
achega
0af981963c
Modificouse 1 ficheiro con 108 adicións e 77 borrados
  1. 108 77
      server/conf_task.py

+ 108 - 77
server/conf_task.py

@@ -122,7 +122,8 @@ def getMediaInfo():
     for task_info in result:
         media_id = task_info['media_id']
 
-        media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id},verify=False).json()['content']
+        media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id}, verify=False).json()[
+            'content']
         media_name = media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info['nickName']
         nick_name = task_info['nick_name']
         spider_link = task_info['spider_link']
@@ -222,7 +223,7 @@ def get_repeat_list():
         if result:
             repeat_list.append(spider_link)
     if repeat_list:
-        return jsonify({'code': 400, 'message': '名单重复', 'repeat_list': repeat_list})
+        return jsonify({'code': 200, 'message': '抓取名单校验通过', 'repeat_list': repeat_list})
     else:
 
         return jsonify({'code': 200, 'message': '抓取名单校验通过', 'repeat_list': repeat_list})
@@ -487,85 +488,115 @@ def create_uid(task, task_id, spider_link):
         content_tag_list.append(tag['tagName'])
     user_tags = ','.join(str(i) for i in tag_name_list)
     user_content_tags = ','.join(str(i) for i in content_tag_list)
-
+    repeat_list = []
+    create_user_list = []
     for author_url in spider_link:
-        now_time = int(time.time())
-        time_array = time.localtime(now_time)
-        str_time = time.strftime("%Y%m%d", time_array)
-        # 生成创建用户的tag
-
-        tags = ''
-        if task['task_type'] == 'author':
-            spider_task = '账号'
-            tags_list = ['spider', spider_task, spider_platform, user_tags, task['content_category_str'], str_time]
-
-        elif task['task_type'] == 'search':
-            spider_task = '搜索'
-            tags_list = ['spider', spider_task, spider_platform, user_tags, author_url, task['content_category_str'], str_time]
-        elif task['task_type'] == 'board':
-            spider_task = '榜单'
-            mode_tags = task['mode_board_str']
-
-            tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'], str_time]
-        elif task['task_type'] == 'recommend':
-            spider_task = '推荐'
-            mode_tags = task['mode_name_str'] + task['mode_board_str']
-            tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'], str_time]
+        repeat_sql = f'select * from crawler_author_map where spider_link="{author_url}"'
+        result = mysql_con.get_values(repeat_sql)
+        if result:
+            old_task_id = result[0]['task_id']
+            is_del = result[0]['is_del']
+            if task_id == old_task_id:
+                if is_del:
+                    continue
+                else:
+                    up_sql = f'update crawler_author_map set is_del=1 where spider_link="{author_url}"'
+                    mysql_con.update_values(up_sql)
+            else:
+                if is_del:
+                    repeat_list.append(author_url)
+                else:
+                    up_sql = f'update crawler_author_map set task_id={task_id},is_del=1 where spider_link="{author_url}"'
+                    mysql_con.update_values(up_sql)
         else:
-            tags_list = ['spider', spider_platform, user_tags, task['content_category_str'], str_time]
-        for v in tags_list:
-            if v:
-                tags += str(v) + ','
-        post_data = {
-            # 'count': 1,     # (必须)账号个数:传1
-            # 'accountType': 4,   # (必须)账号类型 :传 4 app虚拟账号
-            'pwd': '',  # 密码 默认 12346
-            'nickName': '',  # 昵称  默认 vuser......
-            'avatarUrl': '',
-            # 头像Url  默认 http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
-            'tagName': tags[:-1],  # 多条数据用英文逗号分割
-        }
-        try:
-            response = requests.post(url=conf['media_url'], params=post_data)
-            media_id = response.json()['data']
-            media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id},verify=False).json()['content']
-        except Exception as e:
-            logging.warning(f'创建账户:{spider_link},失败,原因:{e}')
-            fail_list.append(author_url)
-            continue
+            create_user_list.append(author_url)
+    if repeat_list:
+        message = f'该任务和其他任务抓取名单重复:{repeat_list}'
+        return jsonify({'code': 400, 'message': message})
+    else:
+        for author_url in create_user_list:
+
+            now_time = int(time.time())
+            time_array = time.localtime(now_time)
+            str_time = time.strftime("%Y%m%d", time_array)
+            # 生成创建用户的tag
+            tags = ''
+            if task['task_type'] == 'author':
+                spider_task = '账号'
+                tags_list = ['spider', spider_task, spider_platform, user_tags, task['content_category_str'], str_time]
+
+            elif task['task_type'] == 'search':
+                spider_task = '搜索'
+                tags_list = ['spider', spider_task, spider_platform, user_tags, author_url,
+                             task['content_category_str'], str_time]
+            elif task['task_type'] == 'board':
+                spider_task = '榜单'
+                mode_tags = task['mode_board_str']
+
+                tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'],
+                             str_time]
+            elif task['task_type'] == 'recommend':
+                spider_task = '推荐'
+                mode_tags = task['mode_name_str'] + task['mode_board_str']
+                tags_list = ['spider', spider_task, spider_platform, user_tags, mode_tags, task['content_category_str'],
+                             str_time]
+            else:
+                tags_list = ['spider', spider_platform, user_tags, task['content_category_str'], str_time]
+            for v in tags_list:
+                if v:
+                    tags += str(v) + ','
+            post_data = {
+                # 'count': 1,     # (必须)账号个数:传1
+                # 'accountType': 4,   # (必须)账号类型 :传 4 app虚拟账号
+                'pwd': '',  # 密码 默认 12346
+                'nickName': '',  # 昵称  默认 vuser......
+                'avatarUrl': '',
+                # 头像Url  默认 http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
+                'tagName': tags[:-1],  # 多条数据用英文逗号分割
+            }
+            try:
+                response = requests.post(url=conf['media_url'], params=post_data)
+                media_id = response.json()['data']
+                media_info = requests.get(url=conf['select_media_url'], params={'uid': media_id}, verify=False).json()[
+                    'content']
+            except Exception as e:
+                logging.warning(f'创建账户:{spider_link},失败,原因:{e}')
+                fail_list.append(author_url)
+                continue
 
-        data = dict(
-            spider_link=author_url,
-            media_id=media_id,
-            media_name=media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info['nickName'],
-            source=source,
-            task_type=task_type,
-            applets_status=applets_status,
-            app_status=app_status,
-            user_tag=user_tags,
-            user_content_tag=user_content_tags,
-            insert_time=int(time.time()),
-            update_time=int(time.time()),
-            create_user_time=now_time,
-            mode_name_str=task['mode_name_str'],
-            mode_board_str=task['mode_board_str'],
-            content_category_str=task['content_category_str'],
-            # mode_value_str=mode_value_str,
-            task_id=task_id,
-            media_main_url=conf['media_main_url'].format(media_id)
-        )
-        keys = ','.join(data.keys())
-        values = ','.join(['%s'] * len(data))
-        table = 'crawler_author_map'
-        sql = f"""insert into {table}({keys}) VALUES({values})"""
-        mysql_con.insert_values(sql, tuple(data.values()))
-        uer_info = dict(
-            outer_id=author_url,
-            uid=media_id
-        )
-        success_list.append(uer_info)
+            data = dict(
+                spider_link=author_url,
+                media_id=media_id,
+                media_name=media_info['longvideoNickName'] if media_info['longvideoNickName'] else media_info[
+                    'nickName'],
+                source=source,
+                task_type=task_type,
+                applets_status=applets_status,
+                app_status=app_status,
+                user_tag=user_tags,
+                user_content_tag=user_content_tags,
+                insert_time=int(time.time()),
+                update_time=int(time.time()),
+                create_user_time=now_time,
+                mode_name_str=task['mode_name_str'],
+                mode_board_str=task['mode_board_str'],
+                content_category_str=task['content_category_str'],
+                # mode_value_str=mode_value_str,
+                task_id=task_id,
+                media_main_url=conf['media_main_url'].format(media_id)
+            )
+            keys = ','.join(data.keys())
+            values = ','.join(['%s'] * len(data))
+            table = 'crawler_author_map'
+            sql = f"""insert into {table}({keys}) VALUES({values})"""
+            mysql_con.insert_values(sql, tuple(data.values()))
+            uer_info = dict(
+                outer_id=author_url,
+                uid=media_id
+            )
+            success_list.append(uer_info)
 
-    return success_list, fail_list
+        return success_list, fail_list
 
 
 @app.route("/v1/crawler/author/create", methods=["POST"])