@@ -10,6 +10,75 @@ from applications import longArticlesMySQL
 
 lam = longArticlesMySQL()
 
 
+def read_rate_debias(row):
+    """
+    De-bias the average-read multiple using the headline (position 1) article.
+    :param row:
+    :return:
+    """
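+    # Assumed reading of the formula: expected reads for a non-headline slot are
+    # the slot's mean reads scaled by the headline's own performance ratio,
+    # with that ratio floored at 1.0.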
+    if row["位置"] != 1:
+        return row["阅读量"] / (
+            max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
+        )
+    else:
+        return row["阅读均值倍数"]
+
+
+def filter_same_title(df):
+    """
+    Filter near-duplicate titles out of the dataframe.
+    :param df:
+    :return:
+    """
+
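+    # Returns the first previously-seen title that is similar enough to
+    # title_target, otherwise False.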
+    def title_sim_v2_by_title_list(title_target, title_list, threshold=0.75):
+        """
+        :param title_target:
+        :param title_list:
+        :param threshold:
+        :return:
+        """
+
+        def title_sim_v2(title_a, title_b, threshold=0.75):
+            """
+            Title similarity.
+            :param title_a:
+            :param title_b:
+            :param threshold:
+            :return:
+            """
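+            # Overlap coefficient on character sets: |A ∩ B| / min(|A|, |B|).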
+            if len(title_a) < 1 or len(title_b) < 1:
+                return False
+            set_a = set(title_a)
+            set_b = set(title_b)
+            set_cross = set_a & set_b
+            set_union = set_a | set_b
+            if not set_union:
+                return False
+            min_len = max(min(len(set_a), len(set_b)), 1)
+            rate = len(set_cross) / min_len
+            if rate >= threshold:
+                return True
+            else:
+                return False
+
+        for title in title_list:
+            sim_score = title_sim_v2(title_target, title, threshold=threshold)
+            if sim_score:
+                return title
+        return False
+
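+    # Greedy pass over the rows: keep a row only if its title is not similar
+    # to any title kept so far.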
+    visited_titles = []
+    data = []
+    for x in df.to_dict(orient='records'):
+        title = x['标题']
+        if title_sim_v2_by_title_list(title, visited_titles):
+            continue
+        visited_titles.append(title)
+        data.append(x)
+    return DataFrame(data)
+
+
 class articleLevelUp(object):
     """
     Article promotion.
@@ -167,24 +236,9 @@ class articleLevelUp(object):
         }
     }
 
-    @classmethod
-    def readRateDebias(cls, row):
+    def get_base_data(self):
         """
-        De-bias the average-read multiple using the headline (position 1) article.
-        :param row:
-        :return:
-        """
-        if row["位置"] != 1:
-            return row["阅读量"] / (
-                max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
-            )
-        else:
-            return row["阅读均值倍数"]
-
-    @classmethod
-    def getBaseData(cls):
-        """
-
+        Fetch article data from the database.
         :return:
         """
         sql = f"""
@@ -194,24 +248,23 @@ class articleLevelUp(object):
             datastat_sort_strategy;
         """
         response = lam.select(sql)
-        df = DataFrame(response, columns=cls.columns)
+        df = DataFrame(response, columns=self.columns)
         df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
         df = df[df["粉丝量"] > 10000].reset_index(drop=True)
         return df
 
-    @classmethod
-    def analysisDF(cls, indexList):
+    def analysis_data(self, index_list):
         """
         Analyse the distribution of each metric in the dataframe.
         :return:
         """
-        DF = cls.getBaseData()
-        DF = DF[(DF["位置"].isin(indexList))]
-        print(len(DF))
-        avg_read_times = DF['阅读均值倍数'].sort_values(ascending=False)
-        read_rate = DF['阅读率'].sort_values(ascending=False)
-        mini_open_rate = DF['小程序打开率'].sort_values(ascending=False)
-        t_plus_0_fission = DF['T+0裂变率'].sort_values(ascending=False)
+        df = self.get_base_data()
+        # keep only the articles at the requested positions
+        df = df[(df["位置"].isin(index_list))]
+        avg_read_times = df['阅读均值倍数'].sort_values(ascending=False)
+        read_rate = df['阅读率'].sort_values(ascending=False)
+        mini_open_rate = df['小程序打开率'].sort_values(ascending=False)
+        t_plus_0_fission = df['T+0裂变率'].sort_values(ascending=False)
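+        # each metric is sorted descending; the summary statistics below are
+        # computed from these series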
         detail = {
             "阅读均值倍数": {
                 "mean": avg_read_times.mean(),
@@ -260,37 +313,38 @@ class articleLevelUp(object):
         }
         print(json.dumps(detail, ensure_ascii=False, indent=4))
 
-    @classmethod
-    def upLevel38To2(cls):
+    def find_good_articles(self, df, pool_level, index_list, read_count, read_avg_times,
+                           account_nickname, pos, way, plan_key=None, tag=None, debug=False):
         """
+        From the articles already published, pick the ones considered high quality.
+        :param read_avg_times: minimum average-read multiple
+        :param read_count: minimum read count
+        :param df: the queried dataframe
+        :param pool_level: traffic-pool level
+        :param index_list: article positions
+        :param account_nickname: forwarded to add_url_list_to_account, together with
+            pos, way, plan_key, tag and debug (the original hunk used these names
+            without defining them, so they are assumed here to be parameters)
         :return:
         """
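+        # quality gate: enough reads, a high enough average-read multiple,
+        # and a position in index_list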
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([3, 4, 5, 6, 7, 8])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['95%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['80%'])
+        good_articles = df[
+            (df['阅读量'] >= read_count) &
+            (df['阅读均值倍数'] >= read_avg_times) &
+            (df['位置'].isin(index_list))
         ]
-        return filter_data
-
-    @classmethod
-    def upLevel2To1(cls):
-        """
-        :return:
-        """
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([2])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['90%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['90%'])
-        ]
-        return filter_data
-
-
-U = articleLevelUp()
-U.analysisDF(indexList=[1])
-f_d = U.upLevel2To1()
-for line in list(zip(f_d['标题'], f_d['链接'])):
-    print(line[0])
-    print(line[1])
-    print("\n")
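+        # de-duplicate similar titles, then surface the newest articles first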
+        distinct_good_articles = filter_same_title(good_articles)
+        sorted_distinct_good_articles = distinct_good_articles.sort_values(
+            by=['发布日期'], ascending=[False]).reset_index(drop=True)
+        print(
+            "traffic pool level {}: {} high-quality articles after de-duplication".format(
+                pool_level, len(sorted_distinct_good_articles.index)))
+        url_list = []
+        title_list = []
+        for x in sorted_distinct_good_articles.to_dict(orient='records'):
+            url_list.append(x['链接'])
+            title_list.append(x['标题'])
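+        # hand the selected articles to the publish plan; add_url_list_to_account
+        # is assumed to be imported elsewhere in this module (import not shown
+        # in this hunk)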
+        add_url_list_to_account(
+            account_nickname,
+            url_list,
+            title_list,
+            pos,
+            way,
+            plan_key=plan_key,
+            tag=tag,
+            debug=debug,
+        )