# weixinRelativeAccountCrawler.py
  1. """
  2. @author: luojunhui
  3. 获取微信相关账号文章下载
  4. """
  5. import datetime
  6. import json
  7. import time
  8. from pandas import DataFrame
  9. from applications import PQMySQL, WeixinSpider, AlgApi, Functions
  10. from config import accountBaseInfo
  11. class weixinRelationAccountGoodArticles(object):
  12. """
  13. 优质账号抓取
  14. """
  15. pq_mysql_client = PQMySQL()
  16. wx_spider = WeixinSpider()
  17. function = Functions()
  18. spider_client = PQMySQL()
  19. nlp = AlgApi()
  20. @classmethod
  21. def findInnerAccount(cls):
  22. """
  23. 找出站内的账号
  24. :return:
  25. """
  26. id_set = set()
  27. for key in accountBaseInfo:
  28. gh_id = key[:-2]
  29. id_set.add(gh_id)
  30. return list(id_set)
  31. @classmethod
  32. def initAccount(cls, gh_id, account_name):
  33. """
  34. 初始化账号
  35. :param gh_id:
  36. :param account_name:
  37. :return:
  38. """
  39. for index in [i for i in range(1, 9)]:
  40. insert_sql = f"""
  41. INSERT INTO long_articles_accounts
  42. (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
  43. values
  44. (%s, %s, %s, %s, %s, %s, %s);
  45. """
  46. try:
  47. cls.pq_mysql_client.update(
  48. sql=insert_sql,
  49. params=(gh_id, "weixin", account_name, index, "association", 0, 1),
  50. )
  51. except Exception as e:
  52. print(e)
  53. print("账号初始化完成")
  54. @classmethod
  55. def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
  56. """
  57. 将账号加入到联想表中
  58. :param gh_id: 联想账号id
  59. :param account_name: 联想账号名称
  60. :param source_title: 源标题
  61. :param source_account: 源账号
  62. :return:
  63. """
  64. insert_sql = f"""
  65. INSERT INTO long_articles_assiciation_accounts
  66. (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
  67. values
  68. (%s, %s, %s, %s, %s, %s);
  69. """
  70. try:
  71. cls.pq_mysql_client.update(
  72. sql=insert_sql,
  73. params=(
  74. gh_id,
  75. account_name,
  76. source_title,
  77. source_account,
  78. datetime.datetime.now().__str__(),
  79. 1,
  80. ),
  81. )
  82. except Exception as e:
  83. print(e)
  84. @classmethod
  85. def getEachAccountArticle(cls, account_id):
  86. """
  87. 获取每个账号的好文章
  88. :return:
  89. """
  90. select_sql = f"""
  91. SELECT title, Type, updateTime, ItemIndex, show_view_count
  92. FROM official_articles_v2
  93. WHERE ghId = '{account_id}';
  94. """
  95. result = cls.pq_mysql_client.select(select_sql)
  96. return DataFrame(
  97. result,
  98. columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
  99. )
  100. @classmethod
  101. def filterGoodArticle(cls, article_data_frame):
  102. """
  103. 获取好的文章
  104. :param article_data_frame:
  105. :return:
  106. """
  107. avg_view = article_data_frame["show_view_count"].mean()
  108. good_articles = article_data_frame[
  109. (article_data_frame["show_view_count"]) > avg_view * 1.1
  110. ]
  111. return good_articles["title"].values.tolist()
  112. @classmethod
  113. def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
  114. """
  115. 通过标题搜索文章
  116. :return:
  117. """
  118. response = cls.wx_spider.search_articles(source_title)
  119. article_list = response["data"]["data"]
  120. if article_list:
  121. title_list = [i["title"] for i in article_list]
  122. title_score_list = cls.nlp.getScoreList(
  123. accountName=source_account, title_list=title_list
  124. )[source_account]["score_list"]
  125. account_list = []
  126. for index, score in enumerate(title_score_list):
  127. # if score > base_score:
  128. article_obj = article_list[index]
  129. account_info = cls.wx_spider.get_account_by_url(
  130. content_url=article_obj["url"]
  131. )
  132. obj = [article_obj["title"], account_info]
  133. account_list.append(obj)
  134. return account_list
  135. else:
  136. return []
  137. @classmethod
  138. def insertIntoDataBase(cls, gh_id, article_list):
  139. """
  140. 将数据插入数据库
  141. :return:
  142. """
  143. for article_obj in article_list:
  144. detail_article_list = article_obj["AppMsg"]["DetailInfo"]
  145. for obj in detail_article_list:
  146. try:
  147. show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
  148. show_view_count = show_stat.get("show_view_count", 0)
  149. show_like_count = show_stat.get("show_like_count", 0)
  150. insert_sql = f"""
  151. insert into crawler_meta_article
  152. (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
  153. VALUES
  154. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  155. """
  156. cls.spider_client.update(
  157. sql=insert_sql,
  158. params=(
  159. "weixin",
  160. "account",
  161. "association",
  162. gh_id,
  163. obj["Title"],
  164. obj["ContentUrl"],
  165. show_view_count,
  166. show_like_count,
  167. obj["Digest"],
  168. obj["send_time"],
  169. int(time.time()),
  170. 1,
  171. cls.function.generateGzhId(obj["ContentUrl"]),
  172. ),
  173. )
  174. except Exception as e:
  175. print(e)
  176. # @classmethod
  177. # def searchResultFilter(cls, filter_type, info):
  178. # """
  179. # 搜索结果过滤
  180. # :param info: 待过滤信息
  181. # :param filter_type: 过滤类型,account表示账号过滤, article表示文章过滤
  182. # :return: 过滤后的结果
  183. # """
  184. # match filter_type:
  185. # case "account":
  186. # return account
  187. if __name__ == "__main__":
  188. weixin = weixinRelationAccountGoodArticles()
  189. # 获取内部账号
  190. inner_account_list = weixin.findInnerAccount()
  191. for source_account in inner_account_list[:1]:
  192. accountArticlesDataFrame = weixin.getEachAccountArticle(
  193. account_id=source_account
  194. )
  195. goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
  196. for title in goodArticles:
  197. account_list = weixin.searchGoodArticlesAccounts(
  198. source_account=source_account, source_title=title
  199. )
  200. print(title)
  201. print(source_account)
  202. for associated_account in account_list:
  203. source_title = associated_account[0]
  204. associated_account_info = associated_account[1]
  205. account_name = associated_account_info["data"]["data"]["account_name"]
  206. gh_id = associated_account_info["data"]["data"]["wx_gh"]
  207. if '新闻' in account_name:
  208. continue
  209. elif '央视' in account_name:
  210. continue
  211. else:
  212. # 初始化账号
  213. weixin.initAccount(gh_id=gh_id, account_name=account_name)
  214. weixin.putIntoAssociationGraph(
  215. gh_id=gh_id,
  216. account_name=account_name,
  217. source_account=source_account,
  218. source_title=title
  219. )