|
@@ -7,8 +7,14 @@ import time
|
|
|
|
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
|
|
|
|
-from applications import WeixinSpider, Functions, DeNetMysql, longArticlesMySQL
|
|
|
|
|
|
+from applications import WeixinSpider, Functions, longArticlesMySQL
|
|
|
|
|
|
|
|
+# 常量
|
|
|
|
+ACCOUNT_GOOD_STATUS = 1
|
|
|
|
+DEFAULT_VIEW_COUNT = 0
|
|
|
|
+DEFAULT_LIKE_COUNT = 0
|
|
|
|
+DEFAULT_ARTICLE_STATUS = 1
|
|
|
|
+DEFAULT_TIMESTAMP = 1704038400
|
|
|
|
|
|
class weixinCategory(object):
|
|
class weixinCategory(object):
|
|
"""
|
|
"""
|
|
@@ -16,12 +22,11 @@ class weixinCategory(object):
|
|
"""
|
|
"""
|
|
|
|
|
|
def __init__(self):
|
|
def __init__(self):
|
|
- self.db_client_lam = longArticlesMySQL
|
|
|
|
- self.db_client_dt = DeNetMysql()
|
|
|
|
|
|
+ self.db_client_lam = longArticlesMySQL()
|
|
self.spider = WeixinSpider()
|
|
self.spider = WeixinSpider()
|
|
self.function = Functions()
|
|
self.function = Functions()
|
|
|
|
|
|
- def getAccountList(self, account_category):
|
|
|
|
|
|
+ def get_account_list(self, account_category):
|
|
"""
|
|
"""
|
|
获取账号
|
|
获取账号
|
|
:param account_category 品类
|
|
:param account_category 品类
|
|
@@ -30,7 +35,7 @@ class weixinCategory(object):
|
|
sql = f"""
|
|
sql = f"""
|
|
select gh_id, account_source, account_name, account_category, latest_update_time
|
|
select gh_id, account_source, account_name, account_category, latest_update_time
|
|
from long_articles_accounts
|
|
from long_articles_accounts
|
|
- where account_category = '{account_category}' and is_using = 1;
|
|
|
|
|
|
+ where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS};
|
|
"""
|
|
"""
|
|
account_tuple = self.db_client_lam.select(sql)
|
|
account_tuple = self.db_client_lam.select(sql)
|
|
result = [
|
|
result = [
|
|
@@ -45,7 +50,7 @@ class weixinCategory(object):
|
|
]
|
|
]
|
|
return result
|
|
return result
|
|
|
|
|
|
- def updateDataIntoMysql(self, gh_id, category, article_list):
|
|
|
|
|
|
+ def insert_data_into_db(self, gh_id, category, article_list):
|
|
"""
|
|
"""
|
|
将数据更新到数据库
|
|
将数据更新到数据库
|
|
:return:
|
|
:return:
|
|
@@ -55,15 +60,15 @@ class weixinCategory(object):
|
|
for obj in detail_article_list:
|
|
for obj in detail_article_list:
|
|
try:
|
|
try:
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
- show_view_count = show_stat.get("show_view_count", 0)
|
|
|
|
- show_like_count = show_stat.get("show_like_count", 0)
|
|
|
|
|
|
+ show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
|
|
+ show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
insert_sql = f"""
|
|
insert_sql = f"""
|
|
insert into crawler_meta_article
|
|
insert into crawler_meta_article
|
|
(platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
|
|
(platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
|
|
VALUES
|
|
VALUES
|
|
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
"""
|
|
"""
|
|
- self.db_client_dt.update(
|
|
|
|
|
|
+ self.db_client_lam.update(
|
|
sql=insert_sql,
|
|
sql=insert_sql,
|
|
params=(
|
|
params=(
|
|
"weixin",
|
|
"weixin",
|
|
@@ -78,14 +83,14 @@ class weixinCategory(object):
|
|
obj["Digest"],
|
|
obj["Digest"],
|
|
obj["send_time"],
|
|
obj["send_time"],
|
|
int(time.time()),
|
|
int(time.time()),
|
|
- 1,
|
|
|
|
|
|
+ DEFAULT_ARTICLE_STATUS,
|
|
self.function.generateGzhId(obj["ContentUrl"]),
|
|
self.function.generateGzhId(obj["ContentUrl"]),
|
|
),
|
|
),
|
|
)
|
|
)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print(e)
|
|
print(e)
|
|
|
|
|
|
- def updateLatestAccountTimeStamp(self, gh_id):
|
|
|
|
|
|
+ def update_latest_account_timestamp(self, gh_id):
|
|
"""
|
|
"""
|
|
更新账号的最新时间戳
|
|
更新账号的最新时间戳
|
|
:return:
|
|
:return:
|
|
@@ -96,7 +101,7 @@ class weixinCategory(object):
|
|
WHERE out_account_id = '{gh_id}'
|
|
WHERE out_account_id = '{gh_id}'
|
|
ORDER BY publish_time DESC LIMIT 1;
|
|
ORDER BY publish_time DESC LIMIT 1;
|
|
"""
|
|
"""
|
|
- result = self.db_client_dt.select(select_sql)
|
|
|
|
|
|
+ result = self.db_client_lam.select(select_sql)
|
|
time_stamp = result[0][0]
|
|
time_stamp = result[0][0]
|
|
dt_str = self.function.time_stamp_to_str(time_stamp)
|
|
dt_str = self.function.time_stamp_to_str(time_stamp)
|
|
update_sql = f"""
|
|
update_sql = f"""
|
|
@@ -106,7 +111,7 @@ class weixinCategory(object):
|
|
"""
|
|
"""
|
|
self.db_client_lam.update(sql=update_sql, params=(dt_str, gh_id))
|
|
self.db_client_lam.update(sql=update_sql, params=(dt_str, gh_id))
|
|
|
|
|
|
- def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None):
|
|
|
|
|
|
+ def update_each_account(self, gh_id, category, latest_time_stamp, index=None):
|
|
"""
|
|
"""
|
|
更新账号文章
|
|
更新账号文章
|
|
:return:
|
|
:return:
|
|
@@ -115,13 +120,13 @@ class weixinCategory(object):
|
|
msg_list = response.get("data", {}).get("data")
|
|
msg_list = response.get("data", {}).get("data")
|
|
if msg_list:
|
|
if msg_list:
|
|
last_article_in_this_msg = msg_list[-1]
|
|
last_article_in_this_msg = msg_list[-1]
|
|
- self.updateDataIntoMysql(
|
|
|
|
|
|
+ self.insert_data_into_db(
|
|
gh_id=gh_id, category=category, article_list=msg_list
|
|
gh_id=gh_id, category=category, article_list=msg_list
|
|
)
|
|
)
|
|
last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
|
|
last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
|
|
if latest_time_stamp < last_time_stamp_in_this_msg:
|
|
if latest_time_stamp < last_time_stamp_in_this_msg:
|
|
next_cursor = response["data"]["next_cursor"]
|
|
next_cursor = response["data"]["next_cursor"]
|
|
- return self.updateEachAccountArticles(
|
|
|
|
|
|
+ return self.update_each_account(
|
|
gh_id=gh_id,
|
|
gh_id=gh_id,
|
|
latest_time_stamp=latest_time_stamp,
|
|
latest_time_stamp=latest_time_stamp,
|
|
category=category,
|
|
category=category,
|
|
@@ -129,32 +134,32 @@ class weixinCategory(object):
|
|
)
|
|
)
|
|
else:
|
|
else:
|
|
# 更新最近抓取时间
|
|
# 更新最近抓取时间
|
|
- self.updateLatestAccountTimeStamp(gh_id=gh_id)
|
|
|
|
|
|
+ self.update_latest_account_timestamp(gh_id=gh_id)
|
|
print("账号时间更新成功")
|
|
print("账号时间更新成功")
|
|
else:
|
|
else:
|
|
print("No more data")
|
|
print("No more data")
|
|
|
|
|
|
|
|
+ def deal(self, category_list):
|
|
|
|
+ """
|
|
|
|
|
|
-if __name__ == "__main__":
|
|
|
|
- wxCategory = weixinCategory()
|
|
|
|
- category_list = [
|
|
|
|
- 'daily-account-mining'
|
|
|
|
- ]
|
|
|
|
- for category in category_list:
|
|
|
|
- account_list = wxCategory.getAccountList(category)
|
|
|
|
- for account in tqdm(account_list):
|
|
|
|
- try:
|
|
|
|
- gh_id = account['gh_id']
|
|
|
|
- category = account['category']
|
|
|
|
|
|
+ :param category_list:
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ for category in category_list:
|
|
|
|
+ account_list = self.get_account_list(category)
|
|
|
|
+ for account in tqdm(account_list):
|
|
try:
|
|
try:
|
|
- timestamp = int(account['latest_timestamp'].timestamp())
|
|
|
|
|
|
+ gh_id = account['gh_id']
|
|
|
|
+ category = account['category']
|
|
|
|
+ try:
|
|
|
|
+ timestamp = int(account['latest_timestamp'].timestamp())
|
|
|
|
+ except Exception as e:
|
|
|
|
+ timestamp = DEFAULT_TIMESTAMP
|
|
|
|
+ self.update_each_account(
|
|
|
|
+ gh_id=gh_id,
|
|
|
|
+ category=category,
|
|
|
|
+ latest_time_stamp=timestamp
|
|
|
|
+ )
|
|
|
|
+ print("success")
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- timestamp = 1704038400
|
|
|
|
- wxCategory.updateEachAccountArticles(
|
|
|
|
- gh_id=gh_id,
|
|
|
|
- category=category,
|
|
|
|
- latest_time_stamp=timestamp
|
|
|
|
- )
|
|
|
|
- print("success")
|
|
|
|
- except Exception as e:
|
|
|
|
- print("fail because of {}".format(e))
|
|
|
|
|
|
+ print("fail because of {}".format(e))
|