@@ -95,6 +95,77 @@ class CategoryRegressionV1:
         return params, t_stats, p_values


+class CategoryLR:
+    def __init__(self):
+        self.features = [
+            'CateOddities', 'CateFamily', 'CateHeartwarm',
+            'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
+            'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
+            'view_count_rate', 'bias'
+        ]
+
+    def preprocess_data(self, df):
+        for cate in category_name_map:  # one-hot encode the category column into indicator features
+            colname = category_name_map[cate]
+            df[colname] = df['category'] == cate
+            df[colname] = df[colname].astype(int)
+
+        df['ClassY'] = df['read_avg_rate'] > 1  # binary target: read_avg_rate above 1
+        df['bias'] = 1.0
+        return df
+
+    def build_and_print(self, df, account_name):
+        if account_name is not None:
+            sub_df = df[df['account_name'] == account_name]
+        else:
+            sub_df = df
+        sample_count = len(sub_df)
+        positive_count = len(sub_df.query('ClassY == 1'))
+        # Skip accounts with too few samples or with only one class present
+        if sample_count < 10 or positive_count * (sample_count - positive_count) == 0:
+            return
+        params, t_stats, p_values = self.run_logistic_regression(sub_df)
+        row = f'{account_name}\t{sample_count}'
+        for param, p_value in zip(params, p_values):
+            row += f'\t{param:.3f}'
+        print(row)
+
+    def build(self, df):
+        p_value_column_names = '\t'.join(self.features)
+        print('account\tsamples\t{}'.format(p_value_column_names))
+        # self.build_and_print(df, None)
+        for account_name in df['account_name'].unique():
+            self.build_and_print(df, account_name)
+
+    def get_param_names(self):
+        return self.features  # 'bias' is already included in the feature list
+
+    def run_logistic_regression(self, df):
+        X = df[self.features]  # feature columns
+        y = df['ClassY']  # target variable
+
+        # Split the data into training and test sets
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+        # Create the logistic regression model (note: the 'bias' column duplicates the built-in intercept)
+        logreg = LogisticRegression()
+
+        # Train the model
+        logreg.fit(X_train, y_train)
+
+        # Predict on the test set
+        y_pred = logreg.predict(X_test)
+
+        # Evaluate model performance
+        accuracy = accuracy_score(y_test, y_pred)
+        conf_matrix = confusion_matrix(y_test, y_pred)
+        class_report = classification_report(y_test, y_pred)
+
+        # print(f"Accuracy: {accuracy}")
+        # print(f"Confusion Matrix: \n{conf_matrix}")
+        # print(f"Classification Report: \n{class_report}")
+        return logreg.coef_[0], None, [0] * len(logreg.coef_[0])  # placeholder t-stats/p-values
+
 def main():
     df = pd.read_excel('20241101_read_rate_samples.xlsx')  # use pd.read_csv instead if the data comes from a CSV file
     df['read_avg'] = df['阅读均值']
@@ -103,7 +174,7 @@ def main():
     df['similarity'] = df['Similarity']
     filter_condition = 'read_avg > 500 ' \
                        'and read_avg_rate > 0 and read_avg_rate < 3 ' \
-                       'and dt > 20240914 and similarity > 0'
+                       'and dt > 20240914 and similarity > 0'
     df = df.query(filter_condition).copy()

     m_cate = CategoryRegressionV1()
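
Not shown in this hunk: where CategoryLR actually gets invoked. Assuming main() mirrors the existing CategoryRegressionV1 flow after the filtering step above (this wiring is a sketch, not part of the patch), usage would look roughly like:

    # Hypothetical wiring inside main(), mirroring the CategoryRegressionV1 calls
    m_lr = CategoryLR()
    df = m_lr.preprocess_data(df)   # adds the Cate* indicator columns, ClassY, and bias
    m_lr.build(df)                  # prints a header plus one coefficient row per account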
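
Also note that run_logistic_regression returns None for the t-stats and all-zero p-values, since scikit-learn's LogisticRegression does not expose them. If real per-coefficient statistics are wanted while keeping the same (params, t_stats, p_values) return shape, one option is a statsmodels fit; the sketch below assumes `import statsmodels.api as sm` is available, which this patch does not add:

    # Sketch only: an unpenalized logistic fit with statsmodels, which reports
    # z-statistics and p-values for each coefficient.
    def run_logistic_regression(self, df):
        X = df[self.features]            # 'bias' already supplies the constant column
        y = df['ClassY'].astype(int)
        result = sm.Logit(y, X).fit(disp=0)
        return result.params.values, result.tvalues.values, result.pvalues.values

Unlike the scikit-learn version, this fits on the full sub_df without a train/test split and without L2 regularization, which appears closer to what CategoryRegressionV1 reports.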