|
@@ -14,6 +14,7 @@ from config import apolloConfig
|
|
|
|
|
|
apollo = apolloConfig()
|
|
apollo = apolloConfig()
|
|
DAILY_CRAWLER_MAX_NUM = 1000
|
|
DAILY_CRAWLER_MAX_NUM = 1000
|
|
|
|
+SIMILARITY_MIN_SCORE = 0.4
|
|
|
|
|
|
|
|
|
|
class CategoryColdStartTask(object):
|
|
class CategoryColdStartTask(object):
|
|
@@ -101,7 +102,8 @@ class CategoryColdStartTask(object):
|
|
}
|
|
}
|
|
)
|
|
)
|
|
article_df = DataFrame(article_list,
|
|
article_df = DataFrame(article_list,
|
|
- columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity', 'score'])
|
|
|
|
|
|
+ columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
|
|
|
|
+ 'llm_sensitivity', 'score'])
|
|
return article_df
|
|
return article_df
|
|
|
|
|
|
def filter_each_category(self, category):
|
|
def filter_each_category(self, category):
|
|
@@ -230,7 +232,7 @@ class CategoryColdStartTask(object):
|
|
length_level5 = filter_df.shape[0]
|
|
length_level5 = filter_df.shape[0]
|
|
|
|
|
|
# 第六层通过相关性分数过滤
|
|
# 第六层通过相关性分数过滤
|
|
- filter_df = filter_df[filter_df['score'] > 0.4]
|
|
|
|
|
|
+ filter_df = filter_df[filter_df['score'] > SIMILARITY_MIN_SCORE]
|
|
length_level6 = filter_df.shape[0]
|
|
length_level6 = filter_df.shape[0]
|
|
|
|
|
|
log(
|
|
log(
|
|
@@ -396,4 +398,3 @@ class CategoryColdStartTask(object):
|
|
"traceback": traceback.format_exc()
|
|
"traceback": traceback.format_exc()
|
|
}
|
|
}
|
|
)
|
|
)
|
|
-
|
|
|