functions.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. """
  2. @author: luojunhui
  3. """
  4. import threading
  5. from datetime import datetime, timezone
  6. import hashlib
  7. import requests
  8. import pymysql
  9. class Functions(object):
  10. """
  11. functions class
  12. """
  13. @classmethod
  14. def getTitleScore(cls, title_list, account_name):
  15. """
  16. 标题打分
  17. :param title_list:
  18. :param account_name:
  19. :return:
  20. """
  21. url = "http://192.168.100.31:6060/score_list"
  22. body = {
  23. "account_nickname_list": [account_name],
  24. "text_list": title_list,
  25. "max_time": None,
  26. "min_time": None,
  27. "interest_type": "avg",
  28. "sim_type": "mean",
  29. "rate": 0.1
  30. }
  31. response = requests.post(url=url, headers={}, json=body).json()
  32. return response
  33. @classmethod
  34. def getTitleAccountScore(cls, title, account_list):
  35. """
  36. 标题打分
  37. :param title:
  38. :param account_list:
  39. :return:
  40. """
  41. url = "http://192.168.100.31:6060/score_list"
  42. body = {
  43. "account_nickname_list": account_list,
  44. "text_list": [title],
  45. "max_time": None,
  46. "min_time": None,
  47. "interest_type": "avg",
  48. "sim_type": "mean",
  49. "rate": 0.1
  50. }
  51. response = requests.post(url=url, headers={}, json=body).json()
  52. L = []
  53. for account in account_list:
  54. account_score = response[account]['score_list'][0]
  55. L.append([account, account_score])
  56. return L
  57. @classmethod
  58. def matchLinkByIdTuple(cls, channel_id_tuple):
  59. """
  60. Use channelContentId to match articleUrl
  61. :param channel_id_tuple:
  62. :return:
  63. """
  64. connection = pymysql.connect(
  65. host='rm-bp12k5fuh5zyx31d28o.mysql.rds.aliyuncs.com',
  66. port=3306,
  67. user='wx2023_ad',
  68. password='wx2023_adP@assword1234',
  69. db='adplatform',
  70. charset='utf8mb4'
  71. )
  72. sql = f"""select id, account_id, link, item_index, title from changwen_article where id in {channel_id_tuple};"""
  73. cursor = connection.cursor()
  74. cursor.execute(sql)
  75. article_link = cursor.fetchall()
  76. L = {}
  77. for line in article_link:
  78. key = line[0]
  79. value = {
  80. "gh_key": "{}_{}".format(line[1], line[3]),
  81. "url": line[2],
  82. "title": line[4]
  83. }
  84. L[key] = value
  85. return L
  86. @classmethod
  87. def TitleSimilarity(cls, title_list, target_title):
  88. """
  89. 计算标题相似度
  90. :return:
  91. """
  92. def title_sim_v2(title_a, title_b, thredhold=0.8):
  93. """
  94. :param title_a:
  95. :param title_b:
  96. :param thredhold:
  97. :return:
  98. """
  99. if len(title_a) < 1 or len(title_b) < 1:
  100. return False
  101. set_a = set(title_a)
  102. set_b = set(title_b)
  103. set_cross = set_a & set_b
  104. set_union = set_a | set_b
  105. if not set_union:
  106. return False
  107. min_len = max(min(len(set_a), len(set_b)), 1)
  108. rate = len(set_cross) / min_len
  109. if rate >= thredhold:
  110. return True
  111. else:
  112. return False
  113. for title in title_list:
  114. sim_score = title_sim_v2(target_title, title)
  115. if sim_score:
  116. return True
  117. return False
  118. @classmethod
  119. def show_desc_to_sta(cls, show_desc):
  120. """
  121. :return:
  122. """
  123. def decode_show_v(show_v):
  124. """
  125. :param show_v:
  126. :return:
  127. """
  128. foo = show_v.replace('千', 'e3').replace('万', 'e4').replace('亿', 'e8')
  129. foo = eval(foo)
  130. return int(foo)
  131. def decode_show_k(show_k):
  132. """
  133. :param show_k:
  134. :return:
  135. """
  136. this_dict = {
  137. '阅读': 'show_view_count', # 文章
  138. '看过': 'show_view_count', # 图文
  139. '观看': 'show_view_count', # 视频
  140. '赞': 'show_like_count',
  141. '付费': 'show_pay_count',
  142. '赞赏': 'show_zs_count',
  143. }
  144. if show_k not in this_dict:
  145. print(f'error from decode_show_k, show_k not found: {show_k}')
  146. return this_dict.get(show_k, 'show_unknown')
  147. show_desc = show_desc.replace('+', '')
  148. sta = {}
  149. for show_kv in show_desc.split('\u2004\u2005'):
  150. if not show_kv:
  151. continue
  152. show_k, show_v = show_kv.split('\u2006')
  153. k = decode_show_k(show_k)
  154. v = decode_show_v(show_v)
  155. sta[k] = v
  156. res = {
  157. 'show_view_count': sta.get('show_view_count', 0),
  158. 'show_like_count': sta.get('show_like_count', 0),
  159. 'show_pay_count': sta.get('show_pay_count', 0),
  160. 'show_zs_count': sta.get('show_zs_count', 0),
  161. }
  162. return res
  163. @classmethod
  164. def generateGzhId(cls, url):
  165. """
  166. generate url
  167. :param url:
  168. :return:
  169. """
  170. biz = url.split("biz=")[1].split("&")[0]
  171. idx = url.split("&idx=")[1].split("&")[0]
  172. sn = url.split("&sn=")[1].split("&")[0]
  173. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  174. md5_hash = hashlib.md5()
  175. md5_hash.update(url_bit)
  176. md5_value = md5_hash.hexdigest()
  177. return md5_value
  178. @classmethod
  179. def time_stamp_to_str(cls, timestamp):
  180. """
  181. :param timestamp:
  182. """
  183. dt_object = datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  184. date_string = dt_object.strftime('%Y-%m-%d %H:%M:%S')
  185. return date_string
  186. @classmethod
  187. def job_with_thread(cls, job_func):
  188. """
  189. 每个任务放到单个线程中
  190. :param job_func:
  191. :return:
  192. """
  193. job_thread = threading.Thread(target=job_func)
  194. job_thread.start()
  195. @classmethod
  196. def str_to_md5(cls, strings):
  197. """
  198. 字符串转化为 md5 值
  199. :param strings:
  200. :return:
  201. """
  202. # 将字符串转换为字节
  203. original_bytes = strings.encode('utf-8')
  204. # 创建一个md5 hash对象
  205. md5_hash = hashlib.md5()
  206. # 更新hash对象,传入原始字节
  207. md5_hash.update(original_bytes)
  208. # 获取16进制形式的MD5哈希值
  209. md5_value = md5_hash.hexdigest()
  210. return md5_value