item.py 3.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. from pydantic import BaseModel, Field
  6. from typing import Optional
  7. class CrawlerMetaArticle(BaseModel):
  8. platform: str = Field(default=..., description="抓取平台")
  9. mode: str = Field(default=..., description="抓取模式")
  10. category: str = Field(
  11. default=..., description="抓取类型:最初设计不合理,积重难返,实际与品类无关"
  12. )
  13. out_account_id: str = Field(default=..., description="抓取账号账号id")
  14. article_index: str = Field(
  15. default=None, description="群发发文位置,常见于微信公众号"
  16. )
  17. title: str = Field(default=..., description="文章标题")
  18. link: str = Field(default=..., description="文章链接")
  19. read_cnt: int = Field(default=0, description="阅读量")
  20. like_cnt: int = Field(default=0, description="点赞量")
  21. description: Optional[str] = Field(
  22. default=None, max_length=255, description="文章简介"
  23. )
  24. publish_time: int = Field(default=None, description="文章发布时间")
  25. crawler_time: int = Field(default=None, description="抓取时间")
  26. score: float = Field(default=None, description="相似度分")
  27. status: int = Field(default=1, description="文章状态")
  28. unique_index: str = Field(default=..., description="文章唯一index")
  29. source_article_title: str = Field(default=None, description="文章联想的种子文章")
  30. source_account: str = Field(default=None, description="账号联想种子账号")
  31. title_sensitivity: int = Field(default=0, description="标题是否敏感")
  32. category_status: int = Field(
  33. default=0,
  34. description="品类处理状态 0: init; 1: processing; 2: successfully; 99: fail",
  35. )
  36. has_video: int = Field(
  37. default=0,
  38. description="文章内嵌套视频状态 0: init; 1: processing; 2: successfully; 3:article link bad ;99: fail",
  39. )
  40. class CrawlerMetaAccount(BaseModel):
  41. account_name: str = Field(..., description="账号名称", min_length=1)
  42. account_id: str = Field(..., description="账号id", min_length=1)
  43. title_list: str = Field(default=None, description="账号主页第一页标题list")
  44. score_list: str = Field(
  45. default=None, description="账号主页第一页标题list契合得分(By LLM)"
  46. )
  47. avg_score: float = Field(default=None, description="score_list 的平均分")
  48. status: int = Field(
  49. default=0,
  50. description="分析状态,0: init, 1: processing, 2: successfully, 99: fail",
  51. )
  52. platform: str = Field(default=None, description="账号来源于哪个外部平台")
  53. crawler_date: datetime.date = Field(
  54. default=None, description="账号抓取日期,格式为“YYYY-MM-DD”"
  55. )
  56. using_status: int = Field(
  57. default=0,
  58. description="账号状态, 0: init, 1: processing, 2: successfully, 99: fail",
  59. )
  60. category_status: int = Field(
  61. default=0,
  62. description="账号品类处理状态, 0: init, 1: processing, 2: successfully, 99: fail",
  63. )
  64. category: str = Field(default=None, description="账号的品类")
  65. media_type: int = Field(default=2, description="账号抓取模态 1: 文章 2:视频")