|
|
18 hours ago | |
|---|---|---|
| app | 18 hours ago | |
| dev | 3 weeks ago | |
| .gitignore | 3 weeks ago | |
| Dockerfile | 3 months ago | |
| LICENSE | 1 year ago | |
| README.md | 3 weeks ago | |
| app_config.toml | 3 months ago | |
| docker-compose.yaml | 3 months ago | |
| jenkins_bash.sh | 3 months ago | |
| requirements.txt | 2 months ago | |
| task_app.py | 3 months ago |
description: a server for long_articles project experiments and tasks
hypercorn task_app:app --config app_config.toml
docker compose up -d
.
├── Dockerfile
├── LICENSE
├── README.md
├── app
│ ├── ab_test
│ │ ├── __init__.py
│ │ ├── ab_accounts.py
│ │ └── get_cover.py
│ ├── api
│ │ ├── middleware
│ │ │ ├── auth.py
│ │ │ ├── error_handler.py
│ │ │ └── rate_limiter.py
│ │ ├── service
│ │ │ ├── __init__.py
│ │ │ ├── daily_rank_manager.py
│ │ │ ├── gzh_cookie_manager.py
│ │ │ ├── task_manager_service.py
│ │ │ └── task_scheduler.py
│ │ └── v1
│ │ ├── endpoints
│ │ │ ├── __init__.py
│ │ │ ├── abtest.py
│ │ │ ├── health.py
│ │ │ ├── mcp.py
│ │ │ ├── monitor.py
│ │ │ ├── rank_log.py
│ │ │ ├── tasks.py
│ │ │ └── tokens.py
│ │ ├── routes
│ │ │ ├── __init__.py
│ │ │ └── routes.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── _utils.py
│ │ ├── deps.py
│ │ └── schemas.py
│ ├── core
│ │ ├── bootstrap
│ │ │ ├── __init__.py
│ │ │ └── resource_manager.py
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ ├── cert
│ │ │ │ └── es_certs.crt
│ │ │ ├── global_settings.py
│ │ │ └── settings
│ │ │ ├── __init__.py
│ │ │ ├── aliyun.py
│ │ │ ├── apollo.py
│ │ │ ├── category.py
│ │ │ ├── cold_start.py
│ │ │ ├── deepseek.py
│ │ │ ├── elasticsearch.py
│ │ │ ├── mysql.py
│ │ │ ├── read_rate_limited.py
│ │ │ └── task_chinese_name.py
│ │ ├── database
│ │ │ ├── __init__.py
│ │ │ └── mysql_pools.py
│ │ ├── dependency
│ │ │ ├── __init__.py
│ │ │ └── dependencies.py
│ │ ├── observability
│ │ │ ├── __init__.py
│ │ │ ├── logging
│ │ │ │ ├── __init__.py
│ │ │ │ └── log_service.py
│ │ │ └── tracing
│ │ └── pipeline
│ │ ├── __init__.py
│ │ ├── crawler_pipeline.py
│ │ ├── data_recycle_pipeline.py
│ │ └── schemas.py
│ ├── domains
│ │ ├── algorithm_tasks
│ │ │ ├── __init__.py
│ │ │ ├── account_category_analysis.py
│ │ │ └── models.py
│ │ ├── analysis_task
│ │ │ ├── __init__.py
│ │ │ ├── account_position_info.py
│ │ │ ├── crawler_detail.py
│ │ │ └── rate_limited_article_filter
│ │ │ ├── __init__.py
│ │ │ ├── _mapper.py
│ │ │ ├── _utils.py
│ │ │ └── entrance.py
│ │ ├── cold_start_tasks
│ │ │ ├── __init__.py
│ │ │ ├── ad_platform_articles
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _utils.py
│ │ │ │ └── entrance.py
│ │ │ ├── article_pool
│ │ │ │ ├── __init__.py
│ │ │ │ ├── article_pool_cold_start_const.py
│ │ │ │ ├── article_pool_cold_start_strategy.py
│ │ │ │ └── article_pool_filter_strategy.py
│ │ │ ├── article_pool_cold_start.py
│ │ │ ├── video_pool
│ │ │ │ ├── __init__.py
│ │ │ │ ├── video_pool_audit_strategy.py
│ │ │ │ └── video_pool_const.py
│ │ │ └── video_pool_cold_start.py
│ │ ├── crawler_tasks
│ │ │ ├── __init__.py
│ │ │ ├── crawler_account_manager.py
│ │ │ ├── crawler_gzh.py
│ │ │ ├── crawler_gzh_fans.py
│ │ │ └── crawler_toutiao.py
│ │ ├── data_recycle_tasks
│ │ │ ├── __init__.py
│ │ │ ├── article_detail_stat.py
│ │ │ ├── recycle_daily_publish_articles.py
│ │ │ ├── recycle_mini_program_detail.py
│ │ │ ├── recycle_mini_program_info
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _util.py
│ │ │ │ └── entrance.py
│ │ │ └── recycle_outside_account_articles.py
│ │ ├── llm_tasks
│ │ │ ├── __init__.py
│ │ │ ├── aigc_decode_task
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _utils.py
│ │ │ │ ├── create_decode_tasks.py
│ │ │ │ ├── extract_decode_task_detail.py
│ │ │ │ └── fetch_decode_results.py
│ │ │ ├── candidate_account_process.py
│ │ │ ├── process_title.py
│ │ │ └── prompts.py
│ │ ├── mcp
│ │ │ ├── __init__.py
│ │ │ ├── _const.py
│ │ │ ├── _handler_map.py
│ │ │ └── _mapper.py
│ │ ├── monitor_tasks
│ │ │ ├── __init__.py
│ │ │ ├── ad_platform_accounts_monitor
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _utils.py
│ │ │ │ └── entrance.py
│ │ │ ├── auto_reply_cards_monitor
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _utils.py
│ │ │ │ └── entrance.py
│ │ │ ├── cooperate_accounts_monitor.py
│ │ │ ├── fwh_group_publish_monitor.py
│ │ │ ├── get_off_videos.py
│ │ │ ├── gzh_article_monitor.py
│ │ │ ├── kimi_balance.py
│ │ │ ├── limited_account_analysis.py
│ │ │ ├── rank_log_monitor
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _const.py
│ │ │ │ ├── _mapper.py
│ │ │ │ ├── _utils.py
│ │ │ │ └── entrance.py
│ │ │ └── task_processing_monitor.py
│ │ └── recommend
│ │ ├── __init__.py
│ │ ├── i2i_recommend
│ │ │ ├── __init__.py
│ │ │ ├── _const.py
│ │ │ ├── _mapper.py
│ │ │ ├── _utils.py
│ │ │ └── entrance.py
│ │ └── offline_recommend
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── strategy
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── get_top_article.py
│ │ │ └── i2i.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── produce_data.py
│ │ └── recommend_apollo.py
│ ├── infra
│ │ ├── crawler
│ │ │ ├── __init__.py
│ │ │ ├── tophub
│ │ │ ├── toutiao
│ │ │ │ ├── __init__.py
│ │ │ │ ├── blogger.py
│ │ │ │ ├── detail_recommend.py
│ │ │ │ ├── main_page_recomend.py
│ │ │ │ ├── search.py
│ │ │ │ ├── toutiao.js
│ │ │ │ └── use_js.py
│ │ │ └── wechat
│ │ │ ├── __init__.py
│ │ │ ├── gzh_article_stat.py
│ │ │ ├── gzh_fans.py
│ │ │ └── gzh_spider.py
│ │ ├── external
│ │ │ ├── __init__.py
│ │ │ ├── aliyun.py
│ │ │ ├── apollo.py
│ │ │ ├── deepseek_official.py
│ │ │ ├── elastic_search.py
│ │ │ ├── feishu.py
│ │ │ └── odps_service.py
│ │ ├── internal
│ │ │ ├── __init__.py
│ │ │ ├── aigc_system.py
│ │ │ ├── long_articles.py
│ │ │ ├── piaoquan.py
│ │ │ └── piaoquan_decode_server.py
│ │ ├── mapper
│ │ │ ├── __init__.py
│ │ │ ├── aigc_mapper.py
│ │ │ ├── long_article_mapper.py
│ │ │ ├── long_video_mapper.py
│ │ │ └── piaoquan_crawler_mapper.py
│ │ └── shared
│ │ ├── __init__.py
│ │ ├── async_tasks.py
│ │ ├── http_client.py
│ │ ├── image.py
│ │ ├── oss.py
│ │ ├── response.py
│ │ └── tools.py
│ ├── jobs
│ │ ├── domains
│ │ │ ├── __init__.py
│ │ │ ├── algorithm.py
│ │ │ ├── anaylsis.py
│ │ │ ├── cold_start.py
│ │ │ ├── crawler_tasks.py
│ │ │ ├── data_recycle.py
│ │ │ ├── llm_task.py
│ │ │ ├── monitor_task.py
│ │ │ └── recommend.py
│ │ ├── task_config.py
│ │ ├── task_handler.py
│ │ ├── task_mapper.py
│ │ └── task_utils.py
│ └── schemas
│ ├── __init__.py
│ └── image.py
├── app_config.toml
├── dev
│ └── crontab_back.txt
├── docker-compose.yaml
├── docs
│ ├── i2i_recommend_data_sync.md
│ ├── rank_log_monitor_tech_plan.md
│ └── rate_limited_article_filter_tech_plan.md
├── jenkins_bash.sh
├── requirements.txt
└── task_app.py
tree -I "__pycache__|*.pyc"
以下为服务器上配置的定时任务(路径与主机以实际环境为准)。
0 9,15,21 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "auto_follow_account"}'
26 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "rate_limited_article_filter"}'
30 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "get_follow_result"}'
50 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "extract_reply_result"}'
0 3 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "crawler_gzh_articles", "account_method": "1030-手动挑号", "crawl_mode": "account", "strategy": "V1"}'
0 4 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "crawler_gzh_articles", "account_method": "cooperate_account", "crawl_mode": "account", "strategy": "V1"}'
# 定时清理文件
0 1 * * * find /root/luojunhui/LongArticlesJob/static -type f -name "*.mp4" -mtime +5 -delete
# 每天 9 点, 18 点执行 gzh 视频抓取
0 9,18 * * * bash /root/luojunhui/LongArticlesJob/sh/run_gzh_video_crawler.sh
* * * * * bash /root/luojunhui/LongArticlesJob/sh/run_long_articles_job.sh
# 外部服务号监测
0 13 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "cooperate_accounts_monitor"}'
30 * * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "cooperate_accounts_detail"}'
# 每天凌晨 4点,下午 4 点各执行一次头条视频抓取
0 4,16 * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_account_video_crawler.sh
# 更新服务号数据
0 10,17 * * * bash /root/luojunhui/LongArticlesJob/sh/run_fwh_data_manager.sh
# 每15分钟执行一次今日头条推荐流抓取
*/15 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_recommend.sh
# 每10分钟执行一次从aigc系统获取发布文章
*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_article_info_from_aigc.sh
# 每10分钟执行一次标题相似度计算任务
*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_title_similarity_task.sh
# 每小时执行一次标题改写
0 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_title_process_task.sh
# 凌晨2点30执行更新小程序信息任务
30 2 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_minigram_info_daily.sh
# 凌晨3:00,下午3:00执行视频号抓取任务
0 3,15 * * * bash /root/luojunhui/LongArticlesJob/sh/run_sph_video_crawler.sh
# 每天上午10点30执行文章退场 && 晋升任务
30 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_title_exit_v1.sh
# 晚上6点执行头条文章冷启动
0 18 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "article_pool_cold_start", "platform": "toutiao", "crawler_methods": ["toutiao_account_association"]}'
# 17:30执行公众号文章池冷启动
30 17 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "article_pool_cold_start", "strategy": "strategy_v3"}'
0 4 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "article_pool_cold_start", "strategy": "strategy_v1"}'
# 早上执行sohu 抓取
0 6 * * * bash /root/luojunhui/LongArticlesJob/sh/run_schedule_app.sh
0 2 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "crawler_toutiao"}'
#
0 14 * * * bash /root/luojunhui/LongArticlesJob/sh/run_cold_start_publish.sh
# 每日上午9点执行账号联想任务
0 9 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_association.sh
# 执行阅读率均值
0 10 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_account_read_rate_avg"}'
# 执行阅读均值
40 10 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_account_read_avg"}'
# 执行打开率均值
50 10 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_account_open_rate_avg"}'
# 每天11点执行文章联想任务
0 11 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_association.sh
# 每4小时执行一次校验视频状态
24 */4 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "check_publish_video_audit_status"}'
# 每天 1:00、8:00、19:00 执行视频发布和审核流程
0 1,8,19 * * * bash /root/luojunhui/LongArticlesJob/sh/run_video_publish_and_audit.sh
30 8,15 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "daily_publish_articles_recycle"}'
0 21 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "daily_publish_articles_recycle"}'
10 22 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "daily_publish_articles_recycle"}'
30 9,16,21 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_root_source_id"}'
30 22 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_root_source_id"}'
# 每天上午 9 点,下午 3 点,晚上 7 点执行下架视频任务
0 9,15,19 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "get_off_videos"}'
# 执行内部文章违规检测
0 9,16,23 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "inner_article_monitor"}'
# 每天 0:20、12:20 执行百度视频抓取
20 0,12 * * * bash /root/luojunhui/LongArticlesJob/sh/run_baidu_video_crawler.sh
# check kimi balance every 4 hours
# 每4h校验一次kimi余额
# 25 */4 * * * curl -X POST http://127.0.0.1:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "check_kimi_balance"}'
# 更新小程序信息
0 3,4,5,6,7,8 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "mini_program_detail_process"}'
# */8 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "extract_title_features", "batch_size": 50, "version": 2}'
# 0 9,16 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "recycle_outside_account_articles"}'
# 35 16 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_outside_account_article_root_source_id"}'
# 执行限流文章分析
40 10 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "update_limited_account_info"}'
# 早上 11 点半获取前日文章详情
30 11 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "article_detail_stat"}'
# 执行广告平台账号内容抓取
0 3 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "ad_platform_accounts_crawler"}'
# 执行广告平台账号详情抓取
32 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "ad_platform_article_detail"}'
# 执行创建解构任务
# 15 20 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "create_ad_platform_accounts_decode_task"}'
# 定时获取解构结果
30 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "fetch_decode_result"}'
# 定时提取解构结果
50 * * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "extract_decode_result"}'
# 腾讯广告互选平台账号--发文至头条
0 6 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "ad_platform_article_publish"}'
# 排序日志更新
0 14 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "rank_log_monitor"}'
# I2I 推荐数据同步
0 5 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "i2i_recommend_data_sync"}'
# 候选账号质量分析
# 0 5,10,15,20 * * * curl -X POST http://192.168.142.66:6060/api/run_task -H "Content-Type: application/json" -d '{"task_name": "candidate_account_quality_analysis"}'