- """
- @author: luojunhui
- 计算账号的阅读均值倍数
- """
- import json
- from pandas import DataFrame
- from tqdm import tqdm
- from applications import DeNetMysql
- from applications import AIDTApi
- D = DeNetMysql()
def get_account_avg():
    """
    Load the per-account average reads, score every crawled article against
    that average, keep the high performers and export them to Excel.
    :return: None
    """
    # Mapping of account key (gh_id plus article position) -> average read count.
    with open("/Users/luojunhui/cyber/LongArticlesJob/dev/avg_new_health.json", encoding="utf-8") as f:
        avg_dict = json.load(f)
    account_position_list = list(avg_dict.keys())
    article_rows = []
    for account in tqdm(account_position_list):
        # The key ends with a one-digit article position index; everything
        # before the last two characters is the account gh_id.
        gh_id = account[:-2]
        index = int(account[-1:])
        select_sql = f"""
            select title, read_cnt, link from crawler_meta_article
            where out_account_id = '{gh_id}' and article_index = {index} and status = 1;
        """
        result_list = D.select(select_sql)
        try:
            avg_read = avg_dict[account]
            for title, read_cnt, link in result_list:
                # Score = article reads relative to the account's average reads.
                avg_score = read_cnt / avg_read
                article_rows.append([title, link, read_cnt, avg_score])
        except (KeyError, TypeError, ZeroDivisionError):
            # Skip accounts whose average is missing or unusable.
            continue
    # Rank articles by their read multiple, best first.
    ranked = sorted(article_rows, reverse=True, key=lambda x: x[3])
    dropped = 0
    kept = 0
    selected = []
    for row in ranked:
        title = row[0]
        read_cnt = row[2]
        # Drop seasonal titles ("农历" lunar calendar, "太极" tai chi, "节" festival),
        # low read multiples, short titles and low absolute read counts.
        if "农历" in title or "太极" in title or "节" in title or row[3] < 1.3 or len(title) < 15 or read_cnt < 5000:
            dropped += 1
            continue
        kept += 1
        print(row)
        selected.append(row)
    print(dropped)
    print(kept)

    df = DataFrame(selected, columns=["title", "link", "read", "read_avg"])
    df.to_excel("health_2.xlsx", index=False)
    # Optionally push the selected article links into a crawler plan.
    # url_list = [i[1] for i in selected[3:]]
    # try:
    #     AIDTApi().updateArticleIntoCrawlerPlan(
    #         plan_id=None,
    #         plan_name="历史冷启-0816-new",
    #         plan_tag="autoArticlePoolLevel1",
    #         url_list=url_list
    #     )
    # except Exception as e:
    #     print("error--{}".format(e))


if __name__ == "__main__":
    get_account_avg()