root il y a 2 ans
Parent
commit
12bd80b025
46 fichiers modifiés avec 7839 ajouts et 0 suppressions
  1. 3885 0
      all_stopword.txt
  2. 81 0
      calCtr.py
  3. 81 0
      calCtr1days.py
  4. 81 0
      calCtr3days.py
  5. 78 0
      calCtr7days.py
  6. 82 0
      calHourCtr.py
  7. 81 0
      calHourData.py
  8. 142 0
      calI2I.py
  9. 176 0
      calI2I2.py
  10. 124 0
      calI2I3.py
  11. 32 0
      clean.sh
  12. 62 0
      compose_score.py
  13. 97 0
      compose_score_3day.py
  14. 361 0
      config.py
  15. 47 0
      cut_title.py
  16. 47 0
      cut_title_top.py
  17. 337 0
      db_help.py
  18. 53 0
      export_3_day.py
  19. 53 0
      export_7_day.py
  20. 53 0
      export_hour_vid.py
  21. 53 0
      export_vid.py
  22. 54 0
      extract_cur_share_log.py
  23. 54 0
      extract_share_log.py
  24. 78 0
      extract_title_tag.py
  25. 54 0
      extract_user_action.py
  26. 53 0
      extract_video_info.py
  27. 46 0
      filter_video.py
  28. 95 0
      get3HotRecall.py
  29. 94 0
      get7HotRecall.py
  30. 83 0
      get_batch_sim_k.py
  31. 59 0
      get_sim_k.py
  32. 30 0
      import_redist.py
  33. 56 0
      predict.py
  34. 53 0
      process_video.py
  35. 68 0
      run.sh
  36. 95 0
      run_3day.sh
  37. 62 0
      run_ctr.sh
  38. 32 0
      run_extract_tag.sh
  39. 65 0
      run_hour.sh
  40. 36 0
      sendmsg.sh
  41. 30 0
      test.py
  42. 35 0
      test.sh
  43. 11 0
      test_faiss.py
  44. 12 0
      train_vec.sh
  45. 550 0
      utils.py
  46. 28 0
      word2vec.py

+ 3885 - 0
all_stopword.txt

@@ -0,0 +1,3885 @@
+--
+?
+“
+”
+》
+--
+able
+about
+above
+according
+accordingly
+across
+actually
+after
+afterwards
+again
+against
+ain't
+all
+allow
+allows
+almost
+alone
+along
+already
+also
+although
+always
+am
+among
+amongst
+an
+and
+another
+any
+anybody
+anyhow
+anyone
+anything
+anyway
+anyways
+anywhere
+apart
+appear
+appreciate
+appropriate
+are
+aren't
+around
+as
+a's
+aside
+ask
+asking
+associated
+at
+available
+away
+awfully
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+behind
+being
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+both
+brief
+but
+by
+came
+can
+cannot
+cant
+can't
+cause
+causes
+certain
+certainly
+changes
+clearly
+c'mon
+co
+com
+come
+comes
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+corresponding
+could
+couldn't
+course
+c's
+currently
+definitely
+described
+despite
+did
+didn't
+different
+do
+does
+doesn't
+doing
+done
+don't
+down
+downwards
+during
+each
+edu
+eg
+eight
+either
+else
+elsewhere
+enough
+entirely
+especially
+et
+etc
+even
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+far
+few
+fifth
+first
+five
+followed
+following
+follows
+for
+former
+formerly
+forth
+four
+from
+further
+furthermore
+get
+gets
+getting
+given
+gives
+go
+goes
+going
+gone
+got
+gotten
+greetings
+had
+hadn't
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+he
+hello
+help
+hence
+her
+here
+hereafter
+hereby
+herein
+here's
+hereupon
+hers
+herself
+he's
+hi
+him
+himself
+his
+hither
+hopefully
+how
+howbeit
+however
+i'd
+ie
+if
+ignored
+i'll
+i'm
+immediate
+in
+inasmuch
+inc
+indeed
+indicate
+indicated
+indicates
+inner
+insofar
+instead
+into
+inward
+is
+isn't
+it
+it'd
+it'll
+its
+it's
+itself
+i've
+just
+keep
+keeps
+kept
+know
+known
+knows
+last
+lately
+later
+latter
+latterly
+least
+less
+lest
+let
+let's
+like
+liked
+likely
+little
+look
+looking
+looks
+ltd
+mainly
+many
+may
+maybe
+me
+mean
+meanwhile
+merely
+might
+more
+moreover
+most
+mostly
+much
+must
+my
+myself
+name
+namely
+nd
+near
+nearly
+necessary
+need
+needs
+neither
+never
+nevertheless
+new
+next
+nine
+no
+nobody
+non
+none
+noone
+nor
+normally
+not
+nothing
+novel
+now
+nowhere
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+on
+once
+one
+ones
+only
+onto
+or
+other
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+outside
+over
+overall
+own
+particular
+particularly
+per
+perhaps
+placed
+please
+plus
+possible
+presumably
+probably
+provides
+que
+quite
+qv
+rather
+rd
+re
+really
+reasonably
+regarding
+regardless
+regards
+relatively
+respectively
+right
+said
+same
+saw
+say
+saying
+says
+second
+secondly
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+several
+shall
+she
+should
+shouldn't
+since
+six
+so
+some
+somebody
+somehow
+someone
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specified
+specify
+specifying
+still
+sub
+such
+sup
+sure
+take
+taken
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+thats
+that's
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+thereafter
+thereby
+therefore
+therein
+theres
+there's
+thereupon
+these
+they
+they'd
+they'll
+they're
+they've
+think
+third
+this
+thorough
+thoroughly
+those
+though
+three
+through
+throughout
+thru
+thus
+to
+together
+too
+took
+toward
+towards
+tried
+tries
+truly
+try
+trying
+t's
+twice
+two
+un
+under
+unfortunately
+unless
+unlikely
+until
+unto
+up
+upon
+us
+use
+used
+useful
+uses
+using
+usually
+value
+various
+very
+via
+viz
+vs
+want
+wants
+was
+wasn't
+way
+we
+we'd
+welcome
+well
+we'll
+went
+were
+we're
+weren't
+we've
+what
+whatever
+what's
+when
+whence
+whenever
+where
+whereafter
+whereas
+whereby
+wherein
+where's
+whereupon
+wherever
+whether
+which
+while
+whither
+who
+whoever
+whole
+whom
+who's
+whose
+why
+will
+willing
+wish
+with
+within
+without
+wonder
+won't
+would
+wouldn't
+yes
+yet
+you
+you'd
+you'll
+your
+you're
+yours
+yourself
+yourselves
+you've
+zero
+zt
+ZT
+zz
+ZZ
+一
+一下
+一些
+一切
+一则
+一天
+一定
+一方面
+一旦
+一时
+一来
+一样
+一次
+一片
+一直
+一致
+一般
+一起
+一边
+一面
+万一
+上下
+上升
+上去
+上来
+上述
+上面
+下列
+下去
+下来
+下面
+不一
+不久
+不仅
+不会
+不但
+不光
+不单
+不变
+不只
+不可
+不同
+不够
+不如
+不得
+不怕
+不惟
+不成
+不拘
+不敢
+不断
+不是
+不比
+不然
+不特
+不独
+不管
+不能
+不要
+不论
+不足
+不过
+不问
+与
+与其
+与否
+与此同时
+专门
+且
+两者
+严格
+严重
+个
+个人
+个别
+中小
+中间
+丰富
+临
+为
+为主
+为了
+为什么
+为什麽
+为何
+为着
+主张
+主要
+举行
+乃
+乃至
+么
+之
+之一
+之前
+之后
+之後
+之所以
+之类
+乌乎
+乎
+乘
+也
+也好
+也是
+也罢
+了
+了解
+争取
+于
+于是
+于是乎
+云云
+互相
+产生
+人们
+人家
+什么
+什么样
+什麽
+今后
+今天
+今年
+今後
+仍然
+从
+从事
+从而
+他
+他人
+他们
+他的
+代替
+以
+以上
+以下
+以为
+以便
+以免
+以前
+以及
+以后
+以外
+以後
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+任务
+企图
+伟大
+似乎
+似的
+但
+但是
+何
+何况
+何处
+何时
+作为
+你
+你们
+你的
+使得
+使用
+例如
+依
+依照
+依靠
+促进
+保持
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+假使
+假如
+假若
+做到
+像
+允许
+充分
+先后
+先後
+先生
+全部
+全面
+兮
+共同
+关于
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其实
+其次
+具体
+具体地说
+具体说来
+具有
+再者
+再说
+冒
+冲
+决定
+况且
+准备
+几
+几乎
+几时
+凭
+凭借
+出去
+出来
+出现
+分别
+则
+别
+别的
+别说
+到
+前后
+前者
+前进
+前面
+加之
+加以
+加入
+加强
+十分
+即
+即令
+即使
+即便
+即或
+即若
+却不
+原来
+又
+及
+及其
+及时
+及至
+双方
+反之
+反应
+反映
+反过来
+反过来说
+取得
+受到
+变成
+另
+另一方面
+另外
+只是
+只有
+只要
+只限
+叫
+叫做
+召开
+叮咚
+可
+可以
+可是
+可能
+可见
+各
+各个
+各人
+各位
+各地
+各种
+各级
+各自
+合理
+同
+同一
+同时
+同样
+后来
+后面
+向
+向着
+吓
+吗
+否则
+吧
+吧哒
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+周围
+呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+啊
+啐
+啥
+啦
+啪达
+喂
+喏
+喔唷
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+因
+因为
+因此
+因而
+固然
+在
+在下
+地
+坚决
+坚持
+基本
+处理
+复杂
+多
+多少
+多数
+多次
+大力
+大多数
+大大
+大家
+大批
+大约
+大量
+失去
+她
+她们
+她的
+好的
+好象
+如
+如上所述
+如下
+如何
+如其
+如果
+如此
+如若
+存在
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+它们的
+它的
+安全
+完全
+完成
+实现
+实际
+宣布
+容易
+密切
+对
+对于
+对应
+将
+少数
+尔后
+尚且
+尤其
+就
+就是
+就是说
+尽
+尽管
+属于
+岂但
+左右
+巨大
+巩固
+己
+已经
+帮助
+常常
+并
+并不
+并不是
+并且
+并没有
+广大
+广泛
+应当
+应用
+应该
+开外
+开始
+开展
+引起
+强烈
+强调
+归
+当
+当前
+当时
+当然
+当着
+形成
+彻底
+彼
+彼此
+往
+往往
+待
+後来
+後面
+得
+得出
+得到
+心里
+必然
+必要
+必须
+怎
+怎么
+怎么办
+怎么样
+怎样
+怎麽
+总之
+总是
+总的来看
+总的来说
+总的说来
+总结
+总而言之
+恰恰相反
+您
+意思
+愿意
+慢说
+成为
+我
+我们
+我的
+或
+或是
+或者
+战斗
+所
+所以
+所有
+所谓
+打
+扩大
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+掌握
+接着
+接著
+故
+故此
+整个
+方便
+方面
+旁人
+无宁
+无法
+无论
+既
+既是
+既然
+时候
+明显
+明确
+是
+是否
+是的
+显然
+显著
+普通
+普遍
+更加
+曾经
+替
+最后
+最大
+最好
+最後
+最近
+最高
+有
+有些
+有关
+有利
+有力
+有所
+有效
+有时
+有点
+有的
+有着
+有著
+望
+朝
+朝着
+本
+本着
+来
+来着
+极了
+构成
+果然
+果真
+某
+某个
+某些
+根据
+根本
+欢迎
+正在
+正如
+正常
+此
+此外
+此时
+此间
+毋宁
+每
+每个
+每天
+每年
+每当
+比
+比如
+比方
+比较
+毫不
+没有
+沿
+沿着
+注意
+深入
+清楚
+满足
+漫说
+焉
+然则
+然后
+然後
+然而
+照
+照着
+特别是
+特殊
+特点
+现代
+现在
+甚么
+甚而
+甚至
+用
+由
+由于
+由此可见
+的
+的话
+目前
+直到
+直接
+相似
+相信
+相反
+相同
+相对
+相对而言
+相应
+相当
+相等
+省得
+看出
+看到
+看来
+看看
+看见
+真是
+真正
+着
+着呢
+矣
+知道
+确定
+离
+积极
+移动
+突出
+突然
+立即
+第
+等
+等等
+管
+紧接着
+纵
+纵令
+纵使
+纵然
+练习
+组成
+经
+经常
+经过
+结合
+结果
+给
+绝对
+继续
+继而
+维持
+综上所述
+罢了
+考虑
+者
+而
+而且
+而况
+而外
+而已
+而是
+而言
+联系
+能
+能否
+能够
+腾
+自
+自个儿
+自从
+自各儿
+自家
+自己
+自身
+至
+至于
+良好
+若
+若是
+若非
+范围
+莫若
+获得
+虽
+虽则
+虽然
+虽说
+行为
+行动
+表明
+表示
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+要求
+规定
+觉得
+认为
+认真
+认识
+让
+许多
+论
+设使
+设若
+该
+说明
+诸位
+谁
+谁知
+赶
+起
+起来
+起见
+趁
+趁着
+越是
+跟
+转动
+转变
+转贴
+较
+较之
+边
+达到
+迅速
+过
+过去
+过来
+运用
+还是
+还有
+这
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这点
+这种
+这边
+这里
+这麽
+进入
+进步
+进而
+进行
+连
+连同
+适应
+适当
+适用
+逐步
+逐渐
+通常
+通过
+造成
+遇到
+遭到
+避免
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那边
+那里
+那麽
+部分
+鄙人
+采取
+里面
+重大
+重新
+重要
+鉴于
+问题
+防止
+阿
+附近
+限制
+除
+除了
+除此之外
+除非
+随
+随着
+随著
+集中
+需要
+非但
+非常
+非徒
+靠
+顺
+顺着
+首先
+高兴
+是不是
+说说
+ 
+$
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+?
+_
+“
+”
+、
+。
+《
+》
+一
+一些
+一何
+一切
+一则
+一方面
+一旦
+一来
+一样
+一般
+一转眼
+万一
+上
+上下
+下
+不
+不仅
+不但
+不光
+不单
+不只
+不外乎
+不如
+不妨
+不尽
+不尽然
+不得
+不怕
+不惟
+不成
+不拘
+不料
+不是
+不比
+不然
+不特
+不独
+不管
+不至于
+不若
+不论
+不过
+不问
+与
+与其
+与其说
+与否
+与此同时
+且
+且不说
+且说
+两者
+个
+个别
+临
+为
+为了
+为什么
+为何
+为止
+为此
+为着
+乃
+乃至
+乃至于
+么
+之
+之一
+之所以
+之类
+乌乎
+乎
+乘
+也
+也好
+也罢
+了
+二来
+于
+于是
+于是乎
+云云
+云尔
+些
+亦
+人
+人们
+人家
+什么
+什么样
+今
+介于
+仍
+仍旧
+从
+从此
+从而
+他
+他人
+他们
+以
+以上
+以为
+以便
+以免
+以及
+以故
+以期
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+似的
+但
+但凡
+但是
+何
+何以
+何况
+何处
+何时
+余外
+作为
+你
+你们
+使
+使得
+例如
+依
+依据
+依照
+便于
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+借
+假使
+假如
+假若
+傥然
+像
+儿
+先不先
+光是
+全体
+全部
+兮
+关于
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其次
+具体地说
+具体说来
+兼之
+内
+再
+再其次
+再则
+再有
+再者
+再者说
+再说
+冒
+冲
+况且
+几
+几时
+凡
+凡是
+凭
+凭借
+出于
+出来
+分别
+则
+则甚
+别
+别人
+别处
+别是
+别的
+别管
+别说
+到
+前后
+前此
+前者
+加之
+加以
+即
+即令
+即使
+即便
+即如
+即或
+即若
+却
+去
+又
+又及
+及
+及其
+及至
+反之
+反而
+反过来
+反过来说
+受到
+另
+另一方面
+另外
+另悉
+只
+只当
+只怕
+只是
+只有
+只消
+只要
+只限
+叫
+叮咚
+可
+可以
+可是
+可见
+各
+各个
+各位
+各种
+各自
+同
+同时
+后
+后者
+向
+向使
+向着
+吓
+吗
+否则
+吧
+吧哒
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+呵
+呵呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咧
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+唯有
+啊
+啐
+啥
+啦
+啪达
+啷当
+喂
+喏
+喔唷
+喽
+嗡
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+嘿嘿
+因
+因为
+因了
+因此
+因着
+因而
+固然
+在
+在下
+在于
+地
+基于
+处在
+多
+多么
+多少
+大
+大家
+她
+她们
+好
+如
+如上
+如上所述
+如下
+如何
+如其
+如同
+如是
+如果
+如此
+如若
+始而
+孰料
+孰知
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+对
+对于
+对待
+对方
+对比
+将
+小
+尔
+尔后
+尔尔
+尚且
+就
+就是
+就是了
+就是说
+就算
+就要
+尽
+尽管
+尽管如此
+岂但
+己
+已
+已矣
+巴
+巴巴
+并
+并且
+并非
+庶乎
+庶几
+开外
+开始
+归
+归齐
+当
+当地
+当然
+当着
+彼
+彼时
+彼此
+往
+待
+很
+得
+得了
+怎
+怎么
+怎么办
+怎么样
+怎奈
+怎样
+总之
+总的来看
+总的来说
+总的说来
+总而言之
+恰恰相反
+您
+惟其
+慢说
+我
+我们
+或
+或则
+或是
+或曰
+或者
+截至
+所
+所以
+所在
+所幸
+所有
+才
+才能
+打
+打从
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+据此
+接着
+故
+故此
+故而
+旁人
+无
+无宁
+无论
+既
+既往
+既是
+既然
+时候
+是
+是以
+是的
+曾
+替
+替代
+最
+有
+有些
+有关
+有及
+有时
+有的
+望
+朝
+朝着
+本
+本人
+本地
+本着
+本身
+来
+来着
+来自
+来说
+极了
+果然
+果真
+某
+某个
+某些
+某某
+根据
+欤
+正值
+正如
+正巧
+正是
+此
+此地
+此处
+此外
+此时
+此次
+此间
+毋宁
+每
+每当
+比
+比及
+比如
+比方
+没奈何
+沿
+沿着
+漫说
+焉
+然则
+然后
+然而
+照
+照着
+犹且
+犹自
+甚且
+甚么
+甚或
+甚而
+甚至
+甚至于
+用
+用来
+由
+由于
+由是
+由此
+由此可见
+的
+的确
+的话
+直到
+相对而言
+省得
+看
+眨眼
+着
+着呢
+矣
+矣乎
+矣哉
+离
+竟而
+第
+等
+等到
+等等
+简言之
+管
+类如
+紧接着
+纵
+纵令
+纵使
+纵然
+经
+经过
+结果
+给
+继之
+继后
+继而
+综上所述
+罢了
+者
+而
+而且
+而况
+而后
+而外
+而已
+而是
+而言
+能
+能否
+腾
+自
+自个儿
+自从
+自各儿
+自后
+自家
+自己
+自打
+自身
+至
+至于
+至今
+至若
+致
+般的
+若
+若夫
+若是
+若果 
+若非
+莫不然
+莫如
+莫若
+虽
+虽则
+虽然
+虽说
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+譬喻
+譬如
+让
+许多
+论
+设使
+设或
+设若
+诚如
+诚然
+该
+说来
+诸
+诸位
+诸如
+谁
+谁人
+谁料
+谁知
+贼死
+赖以
+赶
+起
+起见
+趁
+趁着
+越是
+距
+跟
+较
+较之
+边
+过
+还
+还是
+还有
+还要
+这
+这一来
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这次
+这般
+这边
+这里
+进而
+连
+连同
+逐步
+通过
+遵循
+遵照
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那般
+那边
+那里
+都
+鄙人
+鉴于
+针对
+阿
+除
+除了
+除外
+除开
+除此之外
+除非
+随
+随后
+随时
+随着
+难道说
+非但
+非徒
+非特
+非独
+靠
+顺
+顺着
+首先
+!
+,
+:
+;
+?
+———
+》),
+)÷(1-
+”,
+)、
+=(
+:
+→
+℃ 
+&
+*
+一一
+~~~~
+’
+. 
+『
+.一
+./
+-- 
+』
+=″
+【
+[*]
+}>
+[⑤]]
+[①D]
+c]
+ng昉
+*
+//
+[
+]
+[②e]
+[②g]
+={
+}
+,也 
+‘
+A
+[①⑥]
+[②B] 
+[①a]
+[④a]
+[①③]
+[③h]
+③]
+1. 
+-- 
+[②b]
+’‘ 
+××× 
+[①⑧]
+0:2 
+=[
+[⑤b]
+[②c] 
+[④b]
+[②③]
+[③a]
+[④c]
+[①⑤]
+[①⑦]
+[①g]
+∈[ 
+[①⑨]
+[①④]
+[①c]
+[②f]
+[②⑧]
+[②①]
+[①C]
+[③c]
+[③g]
+[②⑤]
+[②②]
+一.
+[①h]
+.数
+[]
+[①B]
+数/
+[①i]
+[③e]
+[①①]
+[④d]
+[④e]
+[③b]
+[⑤a]
+[①A]
+[②⑧]
+[②⑦]
+[①d]
+[②j]
+〕〔
+][
+://
+′∈
+[②④
+[⑤e]
+12%
+b]
+...
+...................
+…………………………………………………③
+ZXFITL
+[③F]
+」
+[①o]
+]∧′=[ 
+∪φ∈
+′|
+{-
+②c
+}
+[③①]
+R.L.
+[①E]
+-[*]-
+↑
+.日 
+[②d]
+[②
+[②⑦]
+[②②]
+[③e]
+[①i]
+[①B]
+[①h]
+[①d]
+[①g]
+[①②]
+[②a]
+f]
+[⑩]
+a]
+[①e]
+[②h]
+[②⑥]
+[③d]
+[②⑩]
+e]
+〉
+】
+元/吨
+[②⑩]
+2.3%
+5:0  
+[①]
+::
+[②]
+[③]
+[④]
+[⑤]
+[⑥]
+[⑦]
+[⑧]
+[⑨] 
+……
+——
+?
+、
+。
+“
+”
+《
+》
+!
+,
+:
+;
+?
+.
+,
+.
+'
+? 
+———
+──
+? 
+—
+<
+>
+(
+)
+〔
+〕
+[
+]
+(
+)
+-
++
+~
+/
+/
+①
+②
+③
+④
+⑤
+⑥
+⑦
+⑧
+⑨
+⑩
+Ⅲ
+"
+;
+#
+@
+φ.
+■
+▲
+sub
+exp 
+sup
+sub
+Lex 
+#
+%
+&
+'
++
++ξ
+++
+-
+-β
+<
+<±
+<Δ
+<λ
+<φ
+<<
+=
+=
+=☆
+=-
+>
+>λ
+_
+~±
+~+
+[⑤f]
+[⑤d]
+[②i]
+≈ 
+[②G]
+[①f]
+LI
+㈧ 
+[-
+......
+〉
+[③⑩]
+第二
+一番
+一直
+一个
+一些
+许多
+种
+有的是
+也就是说
+末##末
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
+并且
+不比
+不成
+不单
+不但
+不独
+不管
+不光
+不过
+不仅
+不拘
+不论
+不怕
+不然
+不如
+不特
+不惟
+不问
+不只
+朝
+朝着
+趁
+趁着
+乘
+冲
+除
+除此之外
+除非
+除了
+此
+此间
+此外
+从
+从而
+打
+待
+但
+但是
+当
+当着
+到
+得
+的
+的话
+等
+等等
+地
+第
+叮咚
+对
+对于
+多
+多少
+而
+而况
+而且
+而是
+而外
+而言
+而已
+尔后
+反过来
+反过来说
+反之
+非但
+非徒
+否则
+嘎
+嘎登
+该
+赶
+个
+各
+各个
+各位
+各种
+各自
+给
+根据
+跟
+故
+故此
+固然
+关于
+管
+归
+果然
+果真
+过
+哈
+哈哈
+呵
+和
+何
+何处
+何况
+何时
+嘿
+哼
+哼唷
+呼哧
+乎
+哗
+还是
+还有
+换句话说
+换言之
+或
+或是
+或者
+极了
+及
+及其
+及至
+即
+即便
+即或
+即令
+即若
+即使
+几
+几时
+己
+既
+既然
+既是
+继而
+加之
+假如
+假若
+假使
+鉴于
+将
+较
+较之
+叫
+接着
+结果
+借
+紧接着
+进而
+尽
+尽管
+经
+经过
+就
+就是
+就是说
+据
+具体地说
+具体说来
+开始
+开外
+靠
+咳
+可
+可见
+可是
+可以
+况且
+啦
+来
+来着
+离
+例如
+哩
+连
+连同
+两者
+了
+临
+另
+另外
+另一方面
+论
+嘛
+吗
+慢说
+漫说
+冒
+么
+每
+每当
+们
+莫若
+某
+某个
+某些
+拿
+哪
+哪边
+哪儿
+哪个
+哪里
+哪年
+哪怕
+哪天
+哪些
+哪样
+那
+那边
+那儿
+那个
+那会儿
+那里
+那么
+那么些
+那么样
+那时
+那些
+那样
+乃
+乃至
+呢
+能
+你
+你们
+您
+宁
+宁可
+宁肯
+宁愿
+哦
+呕
+啪达
+旁人
+呸
+凭
+凭借
+其
+其次
+其二
+其他
+其它
+其一
+其余
+其中
+起
+起见
+起见
+岂但
+恰恰相反
+前后
+前者
+且
+然而
+然后
+然则
+让
+人家
+任
+任何
+任凭
+如
+如此
+如果
+如何
+如其
+如若
+如上所述
+若
+若非
+若是
+啥
+上下
+尚且
+设若
+设使
+甚而
+甚么
+甚至
+省得
+时候
+什么
+什么样
+使得
+是
+是的
+首先
+谁
+谁知
+顺
+顺着
+似的
+虽
+虽然
+虽说
+虽则
+随
+随着
+所
+所以
+他
+他们
+他人
+它
+它们
+她
+她们
+倘
+倘或
+倘然
+倘若
+倘使
+腾
+替
+通过
+同
+同时
+哇
+万一
+往
+望
+为
+为何
+为了
+为什么
+为着
+喂
+嗡嗡
+我
+我们
+呜
+呜呼
+乌乎
+无论
+无宁
+毋宁
+嘻
+吓
+相对而言
+像
+向
+向着
+嘘
+呀
+焉
+沿
+沿着
+要
+要不
+要不然
+要不是
+要么
+要是
+也
+也罢
+也好
+一
+一般
+一旦
+一方面
+一来
+一切
+一样
+一则
+依
+依照
+矣
+以
+以便
+以及
+以免
+以至
+以至于
+以致
+抑或
+因
+因此
+因而
+因为
+哟
+用
+由
+由此可见
+由于
+有
+有的
+有关
+有些
+又
+于
+于是
+于是乎
+与
+与此同时
+与否
+与其
+越是
+云云
+哉
+再说
+再者
+在
+在下
+咱
+咱们
+则
+怎
+怎么
+怎么办
+怎么样
+怎样
+咋
+照
+照着
+者
+这
+这边
+这儿
+这个
+这会儿
+这就是说
+这里
+这么
+这么点儿
+这么些
+这么样
+这时
+这些
+这样
+正如
+吱
+之
+之类
+之所以
+之一
+只是
+只限
+只要
+只有
+至
+至于
+诸位
+着
+着呢
+自
+自从
+自个儿
+自各儿
+自己
+自家
+自身
+综上所述
+总的来看
+总的来说
+总的说来
+总而言之
+总之
+纵
+纵令
+纵然
+纵使
+遵照
+作为
+兮
+呃
+呗
+咚
+咦
+喏
+啐
+喔唷
+嗬
+嗯
+嗳
+打开天窗说亮话
+到目前为止
+赶早不赶晚
+常言说得好
+何乐而不为
+毫无保留地
+由此可见
+这就是说
+这么点儿
+综上所述
+总的来看
+总的来说
+总的说来
+总而言之
+相对而言
+除此之外
+反过来说
+恰恰相反
+如上所述
+换句话说
+具体地说
+具体说来
+另一方面
+与此同时
+一则通过
+毫无例外
+不然的话
+从此以后
+从古到今
+从古至今
+从今以后
+大张旗鼓
+从无到有
+从早到晚
+弹指之间
+不亦乐乎
+不知不觉
+不止一次
+不择手段
+不可开交
+不可抗拒
+不仅仅是
+不管怎样
+挨家挨户
+长此下去
+长话短说
+除此而外
+除此以外
+除此之外
+得天独厚
+川流不息
+长期以来
+挨门挨户
+挨门逐户
+多多少少
+多多益善
+二话不说
+更进一步
+二话没说
+分期分批
+风雨无阻
+归根到底
+归根结底
+反之亦然
+大面儿上
+倒不如说
+成年累月
+换句话说
+或多或少
+简而言之
+接连不断
+尽如人意
+尽心竭力
+尽心尽力
+尽管如此
+据我所知
+具体地说
+具体来说
+具体说来
+近几年来
+每时每刻
+屡次三番
+三番两次
+三番五次
+三天两头
+另一方面
+老老实实
+年复一年
+恰恰相反
+顷刻之间
+穷年累月
+千万千万
+日复一日
+如此等等
+如前所述
+如上所述
+一方面
+切不可
+顷刻间
+全身心
+另方面
+另一个
+猛然间
+默默地
+就是说
+近年来
+尽可能
+接下来
+简言之
+急匆匆
+即是说
+基本上
+换言之
+充其极
+充其量
+暗地里
+反之则
+比如说
+背地里
+背靠背
+并没有
+不得不
+不得了
+不得已
+不仅仅
+不经意
+不能不
+不外乎
+不由得
+不怎么
+不至于
+策略地
+差不多
+常言道
+常言说
+多年来
+多年前
+差一点
+敞开儿
+抽冷子
+大不了
+反倒是
+反过来
+大体上
+当口儿
+倒不如
+怪不得
+动不动
+看起来
+看上去
+看样子
+够瞧的
+到了儿
+呆呆地
+来不及
+来得及
+到头来
+连日来
+于是乎
+为什么
+这会儿
+换言之
+那会儿
+那么些
+那么样
+什么样
+反过来
+紧接着
+就是说
+要不然
+要不是
+一方面
+以至于
+自个儿
+自各儿
+之所以
+这么些
+这么样
+怎么办
+怎么样
+谁知
+顺着
+似的
+虽然
+虽说
+虽则
+随着
+所以
+他们
+他人
+它们
+她们
+倘或
+倘然
+倘若
+倘使
+要么
+要是
+也罢
+也好
+以便
+依照
+以及
+以免
+以至
+以致
+抑或
+因此
+因而
+因为
+由于
+有的
+有关
+有些
+于是
+与否
+与其
+越是
+云云
+一般
+一旦
+一来
+一切
+一样
+同时
+万一
+为何
+为了
+为着
+嗡嗡
+我们
+呜呼
+乌乎
+无论
+无宁
+沿着
+毋宁
+向着
+照着
+怎么
+咱们
+在下
+再说
+再者
+怎样
+这边
+这儿
+这个
+这里
+这么
+这时
+这些
+这样
+正如
+之类
+之一
+只是
+只限
+只要
+只有
+至于
+诸位
+着呢
+纵令
+纵然
+纵使
+遵照
+作为
+喔唷
+自从
+自己
+自家
+自身
+总之
+要不
+哎呀
+哎哟
+俺们
+按照
+吧哒
+罢了
+本着
+比方
+比如
+鄙人
+彼此
+别的
+别说
+并且
+不比
+不成
+不单
+不但
+不独
+不管
+不光
+不过
+不仅
+不拘
+不论
+不怕
+不然
+不如
+不特
+不惟
+不问
+不只
+朝着
+趁着
+除非
+除了
+此间
+此外
+从而
+但是
+当着
+的话
+等等
+叮咚
+对于
+多少
+而况
+而且
+而是
+而外
+而言
+而已
+尔后
+反之
+非但
+非徒
+否则
+嘎登
+各个
+各位
+各种
+各自
+根据
+故此
+固然
+关于
+果然
+果真
+哈哈
+何处
+何况
+何时
+哼唷
+呼哧
+还是
+还有
+或是
+或者
+极了
+及其
+及至
+即便
+即或
+即令
+即若
+即使
+既然
+既是
+继而
+加之
+假如
+假若
+假使
+鉴于
+几时
+较之
+接着
+结果
+进而
+尽管
+经过
+就是
+可见
+可是
+可以
+况且
+开始
+开外
+来着
+例如
+连同
+两者
+另外
+慢说
+漫说
+每当
+莫若
+某个
+某些
+哪边
+哪儿
+哪个
+哪里
+哪年
+哪怕
+哪天
+哪些
+哪样
+那边
+那儿
+那个
+那里
+那么
+那时
+那些
+那样
+乃至
+宁可
+宁肯
+宁愿
+你们
+啪达
+旁人
+凭借
+其次
+其二
+其他
+其它
+其一
+其余
+其中
+起见
+起见
+岂但
+前后
+前者
+然而
+然后
+然则
+人家
+任何
+任凭
+如此
+如果
+如何
+如其
+如若
+若非
+若是
+上下
+尚且
+设若
+设使
+甚而
+甚么
+甚至
+省得
+时候
+什么
+使得
+是的
+首先
+首先
+其次
+再次
+最后
+您们
+它们
+她们
+他们
+我们
+你是
+您是
+我是
+他是
+她是
+它是
+不是
+你们
+啊哈
+啊呀
+啊哟
+挨次
+挨个
+挨着
+哎呀
+哎哟
+俺们
+按理
+按期
+默然
+按时
+按说
+按照
+暗中
+暗自
+昂然
+八成
+倍感
+倍加
+本人
+本身
+本着
+并非
+别人
+必定
+比起
+比如
+比照
+鄙人
+毕竟
+必将
+必须
+并肩
+并没
+并排
+并且
+并无
+勃然
+不必
+不常
+不大
+不单
+不但
+而且
+不得
+不迭
+不定
+不独
+不对
+不妨
+不管
+不光
+不过
+不会
+不仅
+不拘
+不力
+不了
+不料
+不论
+不满
+不免
+不起
+不巧
+不然
+不日
+不少
+不胜
+不时
+不是
+不同
+不能
+不要
+不外
+不下
+不限
+不消
+不已
+不再
+不曾
+不止
+不只
+才能
+彻夜
+趁便
+趁机
+趁热
+趁势
+趁早
+趁着
+成心
+乘机
+乘势
+乘隙
+乘虚
+诚然
+迟早
+充分
+出来
+出去
+除此
+除非
+除开
+除了
+除去
+除却
+除外
+处处
+传说
+传闻
+纯粹
+此后
+此间
+此外
+此中
+次第
+匆匆
+从不
+从此
+从而
+从宽
+从来
+从轻
+从速
+从头
+从未
+从小
+从新
+从严
+从优
+从中
+从重
+凑巧
+存心
+达旦
+打从
+大大
+大抵
+大都
+大多
+大凡
+大概
+大家
+大举
+大略
+大约
+大致
+待到
+单纯
+单单
+但是
+但愿
+当场
+当儿
+当即
+当然
+当庭
+当头
+当下
+当真
+当中
+当着
+倒是
+到处
+到底
+到头
+得起
+的话
+的确
+等到
+等等
+顶多
+动辄
+陡然
+独自
+断然
+对于
+顿时
+多次
+多多
+多亏
+而后
+而论
+而且
+而是
+而外
+而言
+而已
+而又
+尔等
+反倒
+反而
+反手
+反之
+方才
+方能
+非常
+非但
+非得
+分头
+奋勇
+愤然
+更为
+更加
+根据
+个人
+各式
+刚才
+敢情
+该当
+嘎嘎
+否则
+赶快
+敢于
+刚好
+刚巧
+高低
+格外
+隔日
+隔夜
+公然
+过于
+果然
+果真
+光是
+关于
+共总
+姑且
+故此
+故而
+故意
+固然
+惯常
+毫不
+毫无
+很多
+何须
+好在
+何必
+何尝
+何妨
+何苦
+何况
+何止
+很少
+轰然
+后来
+呼啦
+哗啦
+互相
+忽地
+忽然
+话说
+或是
+伙同
+豁然
+恍然
+还是
+或许
+或者
+基本
+基于
+极大
+极度
+极端
+极力
+极其
+极为
+即便
+即将
+及其
+及至
+即刻
+即令
+即使
+几度
+几番
+几乎
+几经
+既然
+继而
+继之
+加上
+加以
+加之
+假如
+假若
+假使
+间或
+将才
+简直
+鉴于
+将近
+将要
+交口
+较比
+较为
+较之
+皆可
+截然
+截至
+藉以
+借此
+借以
+届时
+尽快
+近来
+进而
+进来
+进去
+尽管
+尽量
+尽然
+就算
+居然
+就此
+就地
+竟然
+究竟
+经常
+尽早
+精光
+经过
+就是
+局外
+举凡
+据称
+据此
+据实
+据说
+可好
+看来
+开外
+绝不
+决不
+据悉
+决非
+绝顶
+绝对
+绝非
+可见
+可能
+可是
+可以
+恐怕
+来讲
+来看
+快要
+况且
+拦腰
+牢牢
+老是
+累次
+累年
+理当
+理该
+理应
+例如
+立地
+立刻
+立马
+立时
+联袂
+连连
+连日
+路经
+临到
+连声
+连同
+连袂
+另外
+另行
+屡次
+屡屡
+缕缕
+率尔
+率然
+略加
+略微
+略为
+论说
+马上
+猛然
+没有
+每当
+每逢
+每每
+莫不
+莫非
+莫如
+莫若
+哪怕
+那么
+那末
+那些
+乃至
+难道
+难得
+难怪
+难说
+你们
+凝神
+宁可
+宁肯
+宁愿
+偶而
+偶尔
+碰巧
+譬如
+偏偏
+平素
+迫于
+扑通
+其次
+其后
+其实
+其它
+起初
+起来
+起首
+起头
+起先
+岂但
+岂非
+岂止
+恰逢
+恰好
+恰恰
+恰巧
+恰如
+恰似
+前后
+前者
+切莫
+切切
+切勿
+亲口
+亲身
+亲手
+亲眼
+亲自
+顷刻
+请勿
+取道
+权时
+全都
+全力
+全年
+全然
+然而
+然后
+人家
+人人
+仍旧
+仍然
+日见
+日渐
+日益
+日臻
+如常
+如次
+如果
+如今
+如期
+如若
+如上
+如下
+上来
+上去
+瑟瑟
+沙沙
+啊
+哎
+唉
+俺
+按
+吧
+把
+甭
+别
+嘿
+很
+乎
+会
+或
+既
+及
+啦
+了
+们
+你
+您
+哦
+砰
+啊
+你
+我
+他
+她
+它

+ 81 - 0
calCtr.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = items[1]
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+10)
+        k_score2 = float(return_users)/(float(view_pv)+10)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score2, share_score*backrate, share_score, backrate, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        k = "k_p:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(k+"\t"+score_info+"\n")
+    redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 81 - 0
calCtr1days.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/1_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+10)
+        k_score2 = float(return_users)/(float(view_pv)+10)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+50)
+        backrate = float(return_users)/(float(view_users)+10)
+        ctr_score = float(play_pv)/(float(view_pv)+50)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score2, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        #k = "k_p3:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(str(k)+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 81 - 0
calCtr3days.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/3_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = items[1]
+        view_users = items[4] 
+        view_pv = items[5]
+        play_users = items[6]
+        play_pv = items[7]
+        share_users = items[8]
+        share_pv = items[9]
+        return_users = items[10]
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv =  item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/3_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+30)
+        k_score2 = float(return_users)/(float(view_pv)+30)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+100)
+        backrate = float(return_users)/(float(view_users)+30)
+        ctr_score = float(play_pv)/(float(view_pv)+100)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score2, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        #k = "k_p3:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(str(k)+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 78 - 0
calCtr7days.py

@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/7_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = items[1]
+        view_users = items[4] 
+        view_pv = items[5]
+        play_users = items[6]
+        play_pv = items[7]
+        share_users = items[8]
+        share_pv = items[9]
+        return_users = items[10]
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_pv
+            share_users = item_info[3]+share_users
+            share_pv = item_info[4]+share_pv
+            return_users = item_info[5]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/7_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = [4]
+        play_users = v[2]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+10)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [share_score, share_score*backrate, share_score, backrate]
+        #k = "k_p4:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 82 - 0
calHourCtr.py

@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+#from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/hour_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    #redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        #print(items)
+        if len(items)<9:
+            continue
+        vid = items[1]
+        view_users = int(items[2])
+        view_pv = int(items[3])
+        play_users = int(items[4])
+        play_pv = int(items[5])
+        share_users = int(items[6])
+        share_pv = int(items[7])
+        return_users = int(items[8])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    f.close()
+    info_dict = {}
+    hour_data_path = "./data/sorted_hour_data_"+nowdate
+    f = open(hour_data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+5)
+        k_score2 = float(return_users)/(float(view_pv)+5)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score2, share_score*backrate, share_score, backrate, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 81 - 0
calHourData.py

@@ -0,0 +1,81 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+#from db_help import RedisHelper
+
+# Aggregate the hourly per-video counters found in
+# ./data/hour_video_data_<nowdate> and write one score record per video to
+# ./data/sorted_hour_info_<nowdate>.
+#
+# Usage: python calHourData.py <nowdate>
+#
+# Input: tab-separated rows. The first line is skipped and rows with fewer
+# than 9 columns are ignored. Column 1 is the video id; columns 2-8 are
+# view_users, view_pv, play_users, play_pv, share_users, share_pv and
+# return_users. Rows that share a video id are summed column by column.
+#
+# Output: one "<vid>\t<json list>" line per video, the list being
+# [return_users/view_pv, share_score*backrate, share_score, backrate,
+#  view_users, view_pv, play_users, play_pv, share_users, share_pv,
+#  return_users]. Every ratio adds 5 to its denominator.
+
+if __name__=="__main__":
+    #1.load data
+    nowdate = sys.argv[1]
+    data_path = "./data/hour_video_data_"+nowdate
+    # vid -> [view_users, view_pv, play_users, play_pv,
+    #         share_users, share_pv, return_users]
+    data_dict = {}
+    with open(data_path) as f:
+        for index, line in enumerate(f):
+            if index == 0:
+                # the first line is always skipped
+                continue
+            items = line.strip().split("\t")
+            if len(items) < 9:
+                continue
+            vid = items[1]
+            counts = [int(x) for x in items[2:9]]
+            totals = data_dict.get(vid)
+            if totals is None:
+                data_dict[vid] = counts
+            else:
+                for pos, value in enumerate(counts):
+                    totals[pos] += value
+    #2.score every video and dump the result
+    hour_data_path = "./data/sorted_hour_info_"+nowdate
+    with open(hour_data_path, 'w') as f:
+        for vid, totals in data_dict.items():
+            # Unpack all seven totals from this video's own record. play_pv
+            # used to be left out here, so the value remaining from the last
+            # input row was written for every video.
+            (view_users, view_pv, play_users, play_pv,
+             share_users, share_pv, return_users) = totals
+            k_score2 = float(return_users)/(float(view_pv)+5)
+            share_score = float(share_pv)/(float(view_pv)+5)
+            backrate = float(return_users)/(float(view_users)+5)
+            score_info = json.dumps([k_score2, share_score*backrate, share_score, backrate,
+                                     view_users, view_pv, play_users, play_pv,
+                                     share_users, share_pv, return_users])
+            f.write(vid+"\t"+score_info+"\n")
+
+     

+ 142 - 0
calI2I.py

@@ -0,0 +1,142 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+
+# Item-to-item recall built from share logs.
+#
+# Usage: python calI2I.py <nowdate> <nowhour>
+#
+# Reads ./data/user_item_share_<nowdate> and
+# ./data/user_cur_day_item_share_<nowhour>, counts how often two videos were
+# shared by the same user, and writes for every video the list of co-shared
+# videos (best score first) to ./data/rec_result_<nowhour>.json.
+
+
+def load_share_log(path, user_item_dict, item_dict):
+    """Accumulate share counts from one tab-separated log file.
+
+    Rows with fewer than 3 columns are skipped. Column 1 is used as the user
+    id and column 2 as the video id. ``user_item_dict`` counts rows per
+    (uid, vid) and ``item_dict`` counts rows per vid; both are updated in
+    place.
+    """
+    with open(path) as f:
+        for line in f:
+            items = line.strip().split("\t")
+            if len(items) < 3:
+                continue
+            key = (items[1], items[2])
+            user_item_dict[key] = user_item_dict.get(key, 0) + 1
+            item_dict[items[2]] = item_dict.get(items[2], 0) + 1
+
+
+def add_recommendation(rec_item_dict, target, candidate, pair_score, item_dict):
+    """Append ``candidate`` to the recommendation list of ``target``.
+
+    The entry is (candidate, i2i_pro, pair_score, candidate_share_count) with
+    i2i_pro = pair_score / (candidate_share_count + 5). Nothing is added when
+    the candidate has no share count.
+    """
+    if candidate not in item_dict:
+        return
+    candidate_score = item_dict[candidate]
+    # explicit float division, as calI2I2.py does
+    i2i_pro = float(pair_score)/(candidate_score+5)
+    rec_item_dict.setdefault(target, []).append(
+        (candidate, i2i_pro, pair_score, candidate_score))
+
+
+if __name__=="__main__":
+    #1.load data
+    nowdate = sys.argv[1]
+    nowhour = sys.argv[2]
+    user_item_dict = {}   # (uid, vid) -> share count
+    item_dict = {}        # vid -> share count
+    load_share_log("./data/user_item_share_"+nowdate, user_item_dict, item_dict)
+    load_share_log("./data/user_cur_day_item_share_"+nowhour, user_item_dict, item_dict)
+    #2. (uid, [(vid, score)....])
+    user_group_dict = {}
+    for (uid, vid), score in user_item_dict.items():
+        user_group_dict.setdefault(uid, []).append((vid, score))
+    #3. expand item
+    # Each unordered pair of videos shared by a user is counted exactly once
+    # under a canonical (smaller, larger) key. The earlier inner loop started
+    # at index 1, so pairs involving a user's first video were counted in one
+    # direction only while every other pair was counted in both.
+    item_pair_dict = {}
+    for v_list in user_group_dict.values():
+        v_n = len(v_list)
+        for i in range(v_n):
+            vid_a, score_a = v_list[i]
+            for j in range(i+1, v_n):
+                vid_b, score_b = v_list[j]
+                if vid_a <= vid_b:
+                    item_key = (vid_a, vid_b)
+                else:
+                    item_key = (vid_b, vid_a)
+                item_score = min(score_a, score_b)
+                item_pair_dict[item_key] = item_pair_dict.get(item_key, 0) + item_score
+    print(len(item_pair_dict))
+    print(len(item_dict))
+    #4. rec item
+    left_pair_num = 0
+    rec_item_dict = {}
+    for (item1, item2), pair_score in item_pair_dict.items():
+        if pair_score < 2:
+            continue
+        left_pair_num += 1
+        # one pair feeds both directions
+        add_recommendation(rec_item_dict, item2, item1, pair_score, item_dict)
+        add_recommendation(rec_item_dict, item1, item2, pair_score, item_dict)
+    print(left_pair_num)
+    #5. sorted item_list
+    # Pairs are unique, so a candidate appears at most once per list and no
+    # de-duplication pass is needed before sorting.
+    final_rec_list = []
+    for k, v in rec_item_dict.items():
+        sorted_v = sorted(v, key=itemgetter(1), reverse=True)
+        final_rec_list.append((k, sorted_v))
+    with open("./data/rec_result_"+nowhour+".json", "w") as f:
+        json.dump(final_rec_list, f)
+    
+     

+ 176 - 0
calI2I2.py

@@ -0,0 +1,176 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+
+# Item-to-item recall built from the filtered share logs.
+#
+# Usage: python calI2I2.py <nowdate> <nowhour>
+#
+# Reads ./data/user_item_share_filter_<nowdate> and
+# ./data/user_cur_day_item_share_filter_<nowhour>, counts the users who
+# shared both videos of a pair, and writes for every video the list of
+# co-shared videos (best score first) to ./data/rec_result3_<nowhour>.json.
+
+# The per-item share count is deliberately replaced by this constant when
+# scoring, so i2i_pro only depends on the pair count.
+ITEM_SCORE = 1.0
+
+
+def load_share_log(path, user_item_dict, item_dict):
+    """Accumulate share counts from one tab-separated log file.
+
+    Rows with fewer than 3 columns, a non-integer video id in column 2, or
+    the video id -1 are skipped. Column 1 is used as the user id.
+    ``user_item_dict`` counts rows per (uid, vid) and ``item_dict`` counts
+    rows per vid; both are updated in place.
+    """
+    with open(path) as f:
+        for line in f:
+            items = line.strip().split("\t")
+            if len(items) < 3:
+                continue
+            try:
+                vid = int(items[2])
+            except ValueError:
+                continue
+            if vid == -1:
+                continue
+            key = (items[1], vid)
+            user_item_dict[key] = user_item_dict.get(key, 0) + 1
+            item_dict[vid] = item_dict.get(vid, 0) + 1
+
+
+def add_recommendation(rec_item_dict, target, candidate, pair_score, item_dict):
+    """Append ``candidate`` to the recommendation list of ``target``.
+
+    The entry is (candidate, i2i_pro, pair_score, ITEM_SCORE) with
+    i2i_pro = pair_score / (ITEM_SCORE + 5). Candidates without a share count
+    or with a negligible i2i_pro are ignored; the decision is taken per
+    direction, so rejecting one direction never drops the other.
+    """
+    if candidate not in item_dict:
+        return
+    i2i_pro = float(pair_score)/(ITEM_SCORE+5)
+    if i2i_pro < 0.000001:
+        return
+    rec_item_dict.setdefault(target, []).append(
+        (candidate, i2i_pro, pair_score, ITEM_SCORE))
+
+
+if __name__=="__main__":
+    #1.load data
+    nowdate = sys.argv[1]
+    nowhour = sys.argv[2]
+    user_item_dict = {}   # (uid, vid) -> share count
+    item_dict = {}        # vid -> share count
+    load_share_log("./data/user_item_share_filter_"+nowdate, user_item_dict, item_dict)
+    load_share_log("./data/user_cur_day_item_share_filter_"+nowhour, user_item_dict, item_dict)
+    #2. (uid, [(vid, score)....])
+    user_group_dict = {}
+    for (uid, vid), score in user_item_dict.items():
+        user_group_dict.setdefault(uid, []).append((vid, score))
+    #3. expand item
+    # Each unordered pair of videos shared by a user adds 1 under a canonical
+    # (smaller, larger) key. The earlier inner loop started at index 1, so
+    # pairs involving a user's first video were counted in one direction only
+    # while every other pair was counted in both.
+    item_pair_dict = {}
+    for v_list in user_group_dict.values():
+        v_n = len(v_list)
+        for i in range(v_n):
+            vid_a = v_list[i][0]
+            for j in range(i+1, v_n):
+                vid_b = v_list[j][0]
+                if vid_a <= vid_b:
+                    item_key = (vid_a, vid_b)
+                else:
+                    item_key = (vid_b, vid_a)
+                item_pair_dict[item_key] = item_pair_dict.get(item_key, 0) + 1
+    print(len(item_pair_dict))
+    #4. rec item
+    left_pair_num = 0
+    rec_item_dict = {}
+    for (item1, item2), pair_score in item_pair_dict.items():
+        if pair_score < 3:
+            continue
+        left_pair_num += 1
+        # one pair feeds both directions
+        add_recommendation(rec_item_dict, item2, item1, pair_score, item_dict)
+        add_recommendation(rec_item_dict, item1, item2, pair_score, item_dict)
+    print(left_pair_num)
+    #5. sorted item_list
+    # Pairs are unique, so a candidate appears at most once per list and no
+    # de-duplication pass is needed before sorting.
+    final_rec_list = []
+    for k, v in rec_item_dict.items():
+        sorted_v = sorted(v, key=itemgetter(1), reverse=True)
+        final_rec_list.append((k, sorted_v))
+    with open("./data/rec_result3_"+nowhour+".json", "w") as f:
+        json.dump(final_rec_list, f)
+    
+     

+ 124 - 0
calI2I3.py

@@ -0,0 +1,124 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+
+# Item-to-item recall built from one day of share logs.
+#
+# Usage: python calI2I3.py <nowdate>
+#
+# Reads ./data/user_item_share_<nowdate>, counts the users who shared both
+# videos of a pair, and writes for every video the list of co-shared videos
+# (best score first) to ./data/rec_result2_<nowdate>.json.
+
+
+def add_recommendation(rec_item_dict, target, candidate, pair_score, item_dict):
+    """Append ``candidate`` to the recommendation list of ``target``.
+
+    The entry is (candidate, i2i_pro, pair_score, candidate_share_count) with
+    i2i_pro = pair_score / (candidate_share_count + 5). Nothing is added when
+    the candidate has no share count.
+    """
+    if candidate not in item_dict:
+        return
+    candidate_score = item_dict[candidate]
+    # explicit float division, as calI2I2.py does
+    i2i_pro = float(pair_score)/(candidate_score+5)
+    rec_item_dict.setdefault(target, []).append(
+        (candidate, i2i_pro, pair_score, candidate_score))
+
+
+if __name__=="__main__":
+    #1.load data
+    nowdate = sys.argv[1]
+    user_item_dict = {}   # (uid, vid) -> share count
+    item_dict = {}        # vid -> share count
+    with open("./data/user_item_share_"+nowdate) as f:
+        for line in f:
+            items = line.strip().split("\t")
+            if len(items) < 3:
+                continue
+            key = (items[1], items[2])
+            user_item_dict[key] = user_item_dict.get(key, 0) + 1
+            item_dict[items[2]] = item_dict.get(items[2], 0) + 1
+    #2. (uid, [(vid, score)....])
+    user_group_dict = {}
+    for (uid, vid), score in user_item_dict.items():
+        user_group_dict.setdefault(uid, []).append((vid, score))
+    #3. expand item
+    # Each unordered pair of videos shared by a user adds 1 under a canonical
+    # (smaller, larger) key. The earlier inner loop started at index 1, so
+    # pairs involving a user's first video were counted in one direction only
+    # while every other pair was counted in both.
+    item_pair_dict = {}
+    for v_list in user_group_dict.values():
+        v_n = len(v_list)
+        for i in range(v_n):
+            vid_a = v_list[i][0]
+            for j in range(i+1, v_n):
+                vid_b = v_list[j][0]
+                if vid_a <= vid_b:
+                    item_key = (vid_a, vid_b)
+                else:
+                    item_key = (vid_b, vid_a)
+                item_pair_dict[item_key] = item_pair_dict.get(item_key, 0) + 1
+    print(item_pair_dict)
+    print(item_dict)
+    #4. rec item
+    left_pair_num = 0
+    rec_item_dict = {}
+    for (item1, item2), pair_score in item_pair_dict.items():
+        if pair_score < 2:
+            continue
+        left_pair_num += 1
+        # one pair feeds both directions
+        add_recommendation(rec_item_dict, item2, item1, pair_score, item_dict)
+        add_recommendation(rec_item_dict, item1, item2, pair_score, item_dict)
+    print(left_pair_num)
+    #5. sorted item_list
+    # Pairs are unique, so a candidate appears at most once per list and no
+    # de-duplication pass is needed before sorting.
+    final_rec_list = []
+    for k, v in rec_item_dict.items():
+        sorted_v = sorted(v, key=itemgetter(1), reverse=True)
+        final_rec_list.append((k, sorted_v))
+    with open("./data/rec_result2_"+nowdate+".json", "w") as f:
+        json.dump(final_rec_list, f)
+    
+     

+ 32 - 0
clean.sh

@@ -0,0 +1,32 @@
+#!/bin/bash
+# Delete intermediate files under ./data that carry an old date stamp.
+# The shebang must start at the very first byte of the file; it previously
+# had a leading space and was therefore not treated as an interpreter line.
+source ~/.bash_profile
+source ~/.bashrc
+
+# NOTE: despite its name this is the date four days ago (YYYYMMDD).
+last3day=`date  +"%Y%m%d" -d -4days`
+# The '*' is quoted here so it is stored literally; the glob is expanded
+# later, when the variable is used unquoted in rm.
+rec_result_path=./data/rec_result3_${last3day}'*'
+rec_cur_day_item_path=./data/user_cur_day_item_share_filter_${last3day}'*'
+merge_path=./data/merge_score_${last3day}'*'
+user_item_share_filter_path=./data/user_item_share_filter_${last3day}'*'
+video_data_path=./data/video_data_${last3day}'*'
+sorted_path=./data/sorted_data_${last3day}'*'
+cls_path=./data/redis_cls_${last3day}'*'
+hour_video_path=./data/hour_video_data_${last3day}'*'
+sorted_hour_path=./data/sorted_hour_data_${last3day}'*'
+# NOTE(review): this pattern has no date stamp, so every rec_result_* file
+# is removed on each run -- confirm that this is intended.
+rec_path=./data/rec_result_'*'
+user_cur_day_path=./data/user_cur_day_item_share_${last3day}'*'
+#user_cur_d=./data/user_cur_day_item_share_${last3day}'*'
+echo ${merge_path}
+echo ${video_data_path}
+echo ${cls_path}
+
+rm -rf ${user_cur_day_path}
+rm -rf ${user_item_share_filter_path}
+rm -rf ${rec_cur_day_item_path}
+rm -rf ${rec_result_path}
+rm -rf ${merge_path}
+rm -rf ${video_data_path}
+rm -rf ${sorted_path}
+rm -rf ${cls_path}
+rm -rf ${hour_video_path}
+rm -rf ${sorted_hour_path}
+rm -rf ${rec_path}

+ 62 - 0
compose_score.py

@@ -0,0 +1,62 @@
+#coding utf-8
+import sys
+import json
+from db_help import RedisHelper
+
+# Merge the hourly scores into the daily scores of every video.
+#
+# Usage: python compose_score.py <nowdate>
+#
+# Reads ./data/sorted_hour_data_<nowdate> ("<vid>\t<json list>") and
+# ./data/sorted_data_<nowdate> ("k_p:<vid>\t<json list>"), writes the merged
+# lists to ./data/merge_score_<nowdate> under the key "k_p2:<vid>" and pushes
+# the same records through RedisHelper.update_batch_setnx_key with a value
+# of 15 days expressed in seconds.
+
+if __name__=="__main__":
+    nowdate = sys.argv[1]
+    # vid -> hourly score list
+    data_dict = {}
+    with open("./data/sorted_hour_data_"+nowdate) as f1:
+        for line in f1:
+            items = line.strip().split("\t")
+            if len(items)<2:
+                continue
+            data_dict[items[0]] = json.loads(items[1])
+    info_dict = {}
+    # Both handles are closed before Redis is touched; the merge file used to
+    # be left open until interpreter exit.
+    with open("./data/sorted_data_"+nowdate) as f2, \
+            open("./data/merge_score_"+nowdate, 'w') as f3:
+        for line in f2:
+            items = line.split("\t")
+            if len(items)<2:
+                continue
+            kid = items[0].replace("k_p:", "")
+            kid2 = "k_p2:"+kid
+            d_item_info = json.loads(items[1])
+            if kid in data_dict:
+                # daily value plus a small share of the hourly value; zip
+                # stops at the shorter list instead of raising on a mismatch
+                total_info = [0.001*hour_value+day_value
+                              for hour_value, day_value in zip(data_dict[kid], d_item_info)]
+                # index 3 is read below, so at least four entries are needed
+                if len(total_info)>3:
+                    total_info[0] = total_info[0]+0.1*total_info[3]
+            else:
+                total_info = list(d_item_info)
+                # NOTE(review): this branch starts from index 1 while the
+                # branch above starts from index 0 -- kept as found.
+                if len(total_info)>3:
+                    total_info[0] = total_info[1]+0.1*total_info[3]
+            total_item_info = json.dumps(total_info)
+            f3.write(kid2+"\t"+total_item_info+"\n")
+            info_dict[kid2] = total_item_info
+    print(info_dict)
+    if len(info_dict)>0:
+        redis_helper = RedisHelper()
+        redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15)

+ 97 - 0
compose_score_3day.py

@@ -0,0 +1,97 @@
+#coding utf-8
+import sys
+import json
+from db_help import RedisHelper
+
+# Blend the hourly, 1-day and 3-day score lists of every video.
+#
+# Usage: python compose_score_3day.py <nowdate>
+#
+# Reads ./data/sorted_hour_info_<nowdate>, ./data/1_days_sorted_data_<nowdate>
+# and ./data/3_days_sorted_data_<nowdate> (all "<vid>\t<json list>"), writes
+# the blended result to ./data/merge_3_days_score_<nowdate> under the key
+# "kp_3:<vid>" and pushes it through RedisHelper.update_batch_setnx_key under
+# "kp_4:<vid>" with a value of 3 days expressed in seconds.
+
+
+def load_scores(path, strip_lines=False):
+    """Return {key: parsed json list} for a "<key>\\t<json>" file.
+
+    Lines with fewer than two tab-separated fields are skipped. When a key
+    occurs more than once the last line wins. ``strip_lines`` strips
+    surrounding whitespace from each line before it is split.
+    """
+    scores = {}
+    with open(path) as f:
+        for line in f:
+            if strip_lines:
+                line = line.strip()
+            items = line.split("\t")
+            if len(items)<2:
+                continue
+            scores[items[0]] = json.loads(items[1])
+    return scores
+
+
+def blend(base_dict, day_dict, base_weight, day_weight):
+    """Blend ``day_dict`` with ``base_dict`` key by key.
+
+    Only keys of ``day_dict`` appear in the result. A key that is also in
+    ``base_dict`` gets base_weight*base + day_weight*day element by element
+    (zip stops at the shorter list); any other key keeps its day values,
+    converted to float.
+    """
+    merged = {}
+    for kid, day_info in day_dict.items():
+        if kid in base_dict:
+            merged[kid] = [base_weight*float(base_value)+day_weight*float(day_value)
+                           for base_value, day_value in zip(base_dict[kid], day_info)]
+        else:
+            merged[kid] = [float(day_value) for day_value in day_info]
+    return merged
+
+
+if __name__=="__main__":
+    nowdate = sys.argv[1]
+    data_dict = load_scores("./data/sorted_hour_info_"+nowdate, strip_lines=True)
+    # hourly (0.6) blended with 1-day (0.4)
+    info_dict = blend(data_dict, load_scores("./data/1_days_sorted_data_"+nowdate), 0.6, 0.4)
+    print("info:", len(info_dict))
+    # previous blend (0.7) blended with 3-day (0.3)
+    day3_dict = blend(info_dict, load_scores("./data/3_days_sorted_data_"+nowdate), 0.7, 0.3)
+    print("info3:", len(day3_dict))
+    res_dict2 = {}
+    # The output handle is closed before Redis is touched; it used to reuse
+    # the name of the still-open 3-day input handle and was never closed.
+    with open("./data/merge_3_days_score_"+nowdate, 'w') as out:
+        for k, v in day3_dict.items():
+            # the leading score repeats the first element, followed by the
+            # first four elements of the blended list
+            new_arr = [v[0], v[0], v[1], v[2], v[3]]
+            total_item_info = json.dumps(new_arr)
+            out.write("kp_3:"+k+"\t"+total_item_info+"\n")
+            res_dict2["kp_4:"+k] = total_item_info
+    if len(res_dict2)>0:
+        redis_helper = RedisHelper()
+        redis_helper.update_batch_setnx_key(res_dict2, 60*60*24*3)

+ 361 - 0
config.py

@@ -0,0 +1,361 @@
+import os
+# from log import Log
+# log_ = Log()
+
+class BaseConfig(object):
+    # 产品标识
+    APP_TYPE = {
+        'VLOG': 0,  # vlog
+        'LOVE_LIVE': 4,  # 票圈视频
+        'LONG_VIDEO': 5,  # 内容精选
+        'SHORT_VIDEO': 6,  # 票圈短视频
+        'WAN_NENG_VIDEO': 17,  # 万能影视屋
+        'LAO_HAO_KAN_VIDEO': 18,  # 老好看视频
+        'ZUI_JING_QI': 19,  # 票圈最惊奇
+        'APP': 13,  # 票圈视频APP
+        'PIAO_QUAN_VIDEO_PLUS': 21,  # 票圈视频+
+        'JOURNEY': 22,  # 票圈足迹
+        'BLESSING_YEAR': 3,  # 票圈福年
+    }
+    # ODPS服务配置
+    ODPS_CONFIG = {
+        'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+    }
+    
+    # 日志服务配置
+    ALIYUN_LOG = {
+        'ENDPOINT': 'cn-hangzhou-intranet.log.aliyuncs.com',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+        'PROJECT': 'rov-server',
+    }
+  
+    REDIS_INFO = {
+        'host': 'r-bp1fogs2mflr1ybfot.redis.rds.aliyuncs.com',
+        'port': 6379,
+        'password': 'Wqsd@2019',
+    }
+ 
+    # 小年糕视频redis存储key
+    XNG_KEY_NAME = 'xng:videos'
+    # 特殊地区屏蔽危险视频redis存储key
+    SPECIAL_AREA_LIMIT_KEY_NAME = 'special:area:limit:videos'
+    #24 hour update 
+    PROJECT_REGION_APP_TYPE = 'loghubods'
+    TABLE_REGION_APP_TYPE = 'video_each_hour_update_province_apptype'
+    
+    CITY_CODE = {
+        '广州': '440100', '深圳': '440300', '成都': '510100', '长沙': '430100',
+    }
+    DATA_PARAMS = {
+        'data1': {APP_TYPE['VLOG']: 0},  # vlog
+        'data2': {APP_TYPE['VLOG']: 0.5, APP_TYPE['LONG_VIDEO']: 0.5},  # [vlog, 内容精选]
+        'data3': {APP_TYPE['VLOG']: 0.5, APP_TYPE['LOVE_LIVE']: 0.5},  # [vlog, 票圈视频]
+        'data4': {APP_TYPE['VLOG']: 0.5, APP_TYPE['SHORT_VIDEO']: 0.5},  # [vlog, 票圈短视频]
+        # 'data5': [APP_TYPE['VLOG'], APP_TYPE['ZUI_JING_QI']],  # [vlog, 最惊奇]
+        'data6': {APP_TYPE['VLOG']: 0.25, APP_TYPE['LOVE_LIVE']: 0.25, APP_TYPE['SHORT_VIDEO']: 0.25,
+                  APP_TYPE['LONG_VIDEO']: 0.25},
+        'data7': {APP_TYPE['VLOG']: 0.5, APP_TYPE['APP']: 0.5},  # [vlog, 票圈视频APP]
+        'data8': {APP_TYPE['VLOG']: 0.7, APP_TYPE['LONG_VIDEO']: 0.3},  # [vlog, 内容精选]
+        'data9': {APP_TYPE['VLOG']: 0.3, APP_TYPE['LONG_VIDEO']: 0.7},  # [vlog, 内容精选]
+        'data10': {APP_TYPE['VLOG']: 0.2, APP_TYPE['LOVE_LIVE']: 0.8},  # [vlog, 票圈视频]
+        'data11': {APP_TYPE['VLOG']: 0.3, APP_TYPE['LOVE_LIVE']: 0.7},  # [vlog, 票圈视频]
+        'data12': {APP_TYPE['VLOG']: 0.4, APP_TYPE['SHORT_VIDEO']: 0.6},  # [vlog, 票圈短视频]
+        'data13': {APP_TYPE['VLOG']: 0.3, APP_TYPE['SHORT_VIDEO']: 0.7},  # [vlog, 票圈短视频]
+        'data14': {APP_TYPE['VLOG']: 0.78, APP_TYPE['LOVE_LIVE']: 0.11, APP_TYPE['SHORT_VIDEO']: 0.08,
+                   APP_TYPE['LONG_VIDEO']: 0.03},
+
+    }
+
+    REGION_CODE = {
+        '北京': '110000', '天津': '120000', '河北省': '130000', '山西省': '140000', '内蒙古': '150000',
+        '辽宁省': '210000', '吉林省': '220000', '黑龙江省': '230000',
+        '上海': '310000', '江苏省': '320000', '浙江省': '330000', '安徽省': '340000', '福建省': '350000', '江西省': '360000', '山东省': '370000',
+        '河南省': '410000', '湖北省': '420000', '湖南省': '430000', '广东省': '440000', '广西': '450000', '海南省': '460000',
+        '重庆': '500000',  '四川省': '510000', '贵州省': '520000', '云南省': '530000', '西藏': '540000',
+        '陕西省': '610000', '甘肃省': '620000', '青海省': '630000', '宁夏': '640000', '新疆': '650000',
+        '台湾省': '710000', '香港': '810000', '澳门': '820000',
+        'None': '-1'
+    } 
+     # 屏蔽视频配置实验组
+    SHIELD_CONFIG2 = {
+        REGION_CODE['北京']: [XNG_KEY_NAME, SPECIAL_AREA_LIMIT_KEY_NAME, ],
+        REGION_CODE['None']: [SPECIAL_AREA_LIMIT_KEY_NAME, XNG_KEY_NAME, ],
+        CITY_CODE['广州']: [SPECIAL_AREA_LIMIT_KEY_NAME, ],
+        CITY_CODE['深圳']: [SPECIAL_AREA_LIMIT_KEY_NAME, ],
+        CITY_CODE['成都']: [SPECIAL_AREA_LIMIT_KEY_NAME, ],
+        CITY_CODE['长沙']: [SPECIAL_AREA_LIMIT_KEY_NAME, ],
+    }
+ 
+    RULE_PARAMS_REGION_APP_TYPE_48H = {
+        'rule_params': {
+            'rule5': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                      'region_24h_rule_key': 'rule2', '48h_rule_key': 'rule1'},
+        },
+        'data_params': {
+            'data1': [APP_TYPE['VLOG'], ],
+        },
+        'params_list': [
+            {'data': 'data1', 'rule': 'rule5'},
+        ],
+    }
+    
+     # 地域分组小时级规则参数
+    RULE_PARAMS_REGION_APP_TYPE = {
+        'rule_params': {
+            # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
+            # 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #           'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
+            'rule4': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                      'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3'},
+            # 涉政视频过滤
+            'rule4-1': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                        'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', 'political_filter': True},
+            # 特殊地域屏蔽危险视频
+            'rule4-2': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                        'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', 'shield_config': SHIELD_CONFIG2},
+
+            # 'rule6': {'view_type': 'preview', 'platform_return_rate': 0.001,
+            #           'region_24h_rule_key': 'rule3', '24h_rule_key': 'rule2'},
+            'rule7': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                      'region_24h_rule_key': 'rule4', '24h_rule_key': 'rule4', 'merge_func': 2},
+            'rule7-1': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                        'region_24h_rule_key': 'rule4', '24h_rule_key': 'rule4', 'merge_func': 2,
+                        'political_filter': True},
+            'rule8': {'view_type': 'preview', 'platform_return_rate': 0.001,
+                      'region_24h_rule_key': 'rule5', '24h_rule_key': 'rule4', 'merge_func': 2},
+            # 'rule9': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #           'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', '30day_rule_key': 'rule1'},
+            # # 无回流人群
+            # 'rule10': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #            'region_24h_rule_key': 'rule6', '24h_rule_key': 'rule5', 'click_score_rate': 0.7},
+            # 'rule13': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #            'region_24h_rule_key': 'rule8', '24h_rule_key': 'rule7', 'click_score_rate': 0.8},
+            # # 有回流人群
+            # 'rule11': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #            'region_24h_rule_key': 'rule7', '24h_rule_key': 'rule6', 'back_score_rate': 0.7},
+            # 'rule14': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #            'region_24h_rule_key': 'rule9', '24h_rule_key': 'rule8', 'back_score_rate': 0.8},
+            # # 20点地域小时级列表中增加7点-19点地域小时级的优质视频
+            # 'rule12': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            #            'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', 'add_videos_in_20h': True},
+
+            # 地域小时级列表中增加 前6小时 地域小时级的优质视频
+            'rule15': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3',
+                       'add_videos_with_pre_h': True, 'hour_count': 6},
+            # 地域小时级列表中增加 前2小时 地域小时级的优质视频,排序优化1:半小时级列表中有的视频以本小时的分数为准
+            'rule16': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3',
+                       'add_videos_with_pre_h': True, 'hour_count': 2, 'add_func': 'func2'},
+            # 地域小时级列表中增加 前47小时 地域小时级的优质视频
+            'rule17': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3',
+                       'add_videos_with_pre_h': True, 'hour_count': 47},
+            # 地域小时级列表中增加 前3小时 地域小时级的优质视频,排序优化1:半小时级列表中有的视频以本小时的分数为准
+            'rule18': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3',
+                       'add_videos_with_pre_h': True, 'hour_count': 3, 'add_func': 'func2'},
+
+        },
+        'data_params': DATA_PARAMS,
+        'params_list': [
+            {'data': 'data1', 'rule': 'rule4'},  # 095 vlog
+            {'data': 'data1', 'rule': 'rule4-1'},  # 095-1
+            {'data': 'data1', 'rule': 'rule4-2'},  # 262 特殊地域屏蔽危险视频
+            # {'data': 'data2', 'rule': 'rule4'},
+            {'data': 'data2', 'rule': 'rule7-1'},  # 121 内容精选
+            # {'data': 'data3', 'rule': 'rule7'},
+            # {'data': 'data4', 'rule': 'rule7'},
+            # {'data': 'data6', 'rule': 'rule7'},
+            {'data': 'data7', 'rule': 'rule8'},  # 票圈视频APP 10003.110156
+            # {'data': 'data1', 'rule': 'rule9'},
+            # {'data': 'data1', 'rule': 'rule10'},
+            # {'data': 'data1', 'rule': 'rule11'},
+            # {'data': 'data8', 'rule': 'rule7'},
+            # {'data': 'data9', 'rule': 'rule7'},
+            {'data': 'data10', 'rule': 'rule7'},  # 144 票圈视频
+            # {'data': 'data11', 'rule': 'rule7'},
+            # {'data': 'data12', 'rule': 'rule7'},
+            # {'data': 'data13', 'rule': 'rule7'},
+            # {'data': 'data1', 'rule': 'rule12'},
+            # {'data': 'data14', 'rule': 'rule7'},  # 159
+            # {'data': 'data1', 'rule': 'rule13'},  # 161
+            # {'data': 'data1', 'rule': 'rule14'},  # 162
+            # {'data': 'data1', 'rule': 'rule15'},  # 200 vlog
+            # {'data': 'data1', 'rule': 'rule16'},  # 214 vlog
+            # {'data': 'data1', 'rule': 'rule17'},  # 215 vlog
+            # {'data': 'data1', 'rule': 'rule18'},  # 224 vlog
+        ],
+    }
+
+class TestConfig(BaseConfig):
+    """测试环境配置"""
+    # 报警内容 环境区分
+    ENV_TEXT = "测试环境"
+    # 项目存放目录
+    PROJECT_PATH = '/data2/rov-offline'
+
+    # 测试环境redis地址
+    REDIS_INFO = {
+        'host': 'r-bp1ps6my7lzg8rdhwx682.redis.rds.aliyuncs.com',
+        'port': 6379,
+        'password': 'Wqsd@2019',
+    }
+
+    # Hologres连接参数,服务器使用
+    HOLOGRES_INFO = {
+        'host': 'hgprecn-cn-7pp28y18c00c-cn-hangzhou-vpc.hologres.aliyuncs.com',
+        'port': 80,
+        'dbname': 'dssm',
+        'user': 'LTAI5tMPqPy9yboQAf1mBCCN',
+        'password': '4BEcOgxREOPq7t3A7EWkjciVULeQGj'
+    }
+
+    # 测试环境mysql地址
+    MYSQL_INFO = {
+        'host': 'rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com',
+        'port': 3306,
+        'user': 'wx2016_longvideo',
+        'password': 'wx2016_longvideoP@assword1234',
+        'db': 'longvideo',
+        'charset': 'utf8'
+    }
+
+    # 测试环境 过滤用mysql地址
+    FILTER_MYSQL_INFO = {
+        'host': 'am-bp1g3ys9u00u483uc131930.ads.aliyuncs.com',
+        'port': 3306,
+        'user': 'lv_manager',
+        'password': 'lv_manager@2020',
+        'db': 'longvideo',
+        'charset': 'utf8'
+    }
+
+    # 日志服务配置
+    ALIYUN_LOG = {
+        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+        'PROJECT': 'rov-server-test',
+    }
+
+    # Hologres视频状态存储表名
+    VIDEO_STATUS = 'longvideo_test.dwd_mdm_item_video_stat'
+
+    # 快速曝光流量池ID
+    QUICK_FLOW_POOL_ID = 3
+
+    # 获取流量池分发配置接口地址
+    GET_FLOW_POOL_RECOMMEND_CONFIG_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/recommend/getConfig'
+    # 从流量池获取视频接口地址
+    GET_VIDEOS_FROM_POOL_URL = 'http://testapi-internal.piaoquantv.com/flowpool/video/getAllVideo'
+    # 获取视频在流量池中的剩余可分发数接口地址
+    GET_REMAIN_VIEW_COUNT_URL = 'http://testapi-internal.piaoquantv.com/flowpool/video/remainViewCount'
+    # 计算完ROV通知后端接口地址
+    NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/recommend/updateRovScore'
+    # 获取置顶视频列表接口地址
+    TOP_VIDEO_LIST_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/recommend/topVideoList'
+    # 获取首页兜底视频json接口地址
+    BOTTOM_JSON_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/video/distribute/structure/video/list'
+    # 通知后端更新兜底视频接口地址
+    NOTIFY_BACKEND_updateFallBackVideoList_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/recommend/updateFallBackVideoList'
+    # 获取限流视频接口地址
+    GET_VIDEO_LIMIT_LIST_URL = 'http://videotest-internal.yishihui.com/longvideoapi/openapi/recommend/getVideoLimitList'
+    # 获取管理后台设置的广告目标uv值接口地址
+    GET_AD_TARGET_UV_URL = 'https://testadmin.piaoquantv.com/manager/ad/algo/threshold/productUvTargetList'
+
+    # # logs 上传oss 目标Bucket指定目录
+    # OSS_FOLDER_LOGS = 'rov-offline/test/logs/'
+    # # data 上传oss 目标Bucket指定目录
+    # OSS_FOLDER_DATA = 'rov-offline/test/data/'
+
+class ProductionConfig(BaseConfig):
+    """生产环境配置"""
+    # 报警内容 环境区分
+    ENV_TEXT = "生产环境"
+    # 项目存放目录
+    PROJECT_PATH = '/data/rov-offline'
+
+    # 线上环境redis地址
+    REDIS_INFO = {
+        'host': 'r-bp1fogs2mflr1ybfot.redis.rds.aliyuncs.com',
+        'port': 6379,
+        'password': 'Wqsd@2019',
+    }
+
+    # Hologres连接参数,服务器使用
+    HOLOGRES_INFO = {
+        'host': 'hgprecn-cn-7pp28y18c00c-cn-hangzhou-vpc.hologres.aliyuncs.com',
+        'port': 80,
+        'dbname': 'dssm',
+        'user': 'LTAI5tMPqPy9yboQAf1mBCCN',
+        'password': '4BEcOgxREOPq7t3A7EWkjciVULeQGj'
+    }
+
+    # 生产环境mysql地址
+    MYSQL_INFO = {
+        'host': 'rr-bp1x9785e8h5452bi157.mysql.rds.aliyuncs.com',
+        'port': 3306,
+        'user': 'wx2016_longvideo',
+        'password': 'wx2016_longvideoP@assword1234',
+        'db': 'longvideo',
+        'charset': 'utf8'
+    }
+
+    # 生产环境 过滤用mysql地址
+    FILTER_MYSQL_INFO = {
+        'host': 'am-bp15tqt957i3b3sgi131950.ads.aliyuncs.com',
+        'port': 3306,
+        'user': 'lv_manager',
+        'password': 'lv_manager@2020',
+        'db': 'longvideo',
+        'charset': 'utf8'
+    }
+
+    # 日志服务配置
+    ALIYUN_LOG = {
+        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+        'PROJECT': 'rov-server',
+    }
+
+    # Hologres视频状态存储表名
+    VIDEO_STATUS = 'longvideo.dwd_mdm_item_video_stat'
+
+    # 快速曝光流量池ID
+    QUICK_FLOW_POOL_ID = 3
+
+    # 获取流量池分发配置接口地址
+    GET_FLOW_POOL_RECOMMEND_CONFIG_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/getConfig'
+    # 从流量池获取视频接口地址
+    GET_VIDEOS_FROM_POOL_URL = 'http://api-internal.piaoquantv.com/flowpool/video/getAllVideo'
+    # 获取视频在流量池中的剩余可分发数接口地址
+    GET_REMAIN_VIEW_COUNT_URL = 'http://api-internal.piaoquantv.com/flowpool/video/remainViewCount'
+    # 计算完ROV通知后端接口地址
+    NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/updateRovScore'
+    # 获取置顶视频列表接口地址
+    TOP_VIDEO_LIST_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/topVideoList'
+    # 获取首页兜底视频json接口地址
+    BOTTOM_JSON_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/video/distribute/structure/video/list'
+    # 通知后端更新兜底视频接口地址
+    NOTIFY_BACKEND_updateFallBackVideoList_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/updateFallBackVideoList'
+    # 获取限流视频接口地址
+    GET_VIDEO_LIMIT_LIST_URL = 'http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/getVideoLimitList'
+    # 获取管理后台设置的广告目标uv值接口地址
+    GET_AD_TARGET_UV_URL = 'https://admin.piaoquantv.com/manager/ad/algo/threshold/productUvTargetList'
+
+    # # logs 上传oss 目标Bucket指定目录
+    # OSS_FOLDER_LOGS = 'rov-offline/pro/logs/'
+    # # data 上传oss 目标Bucket指定目录
+    # OSS_FOLDER_DATA = 'rov-offline/pro/data/'
+
+def set_config():
+    """Return the configuration object used by the offline jobs.
+
+    :return: a ``ProductionConfig`` instance, unconditionally.
+    """
+    # Read the environment variable 'Base_ENV' (the original comment here
+    # named ROV_OFFLINE_ENV instead).
+    # NOTE(review): ``env`` is never used below, so the production config is
+    # returned whatever the environment says -- confirm this is intended
+    # before running these scripts against a test environment.
+    env = os.environ.get('Base_ENV')
+    return ProductionConfig()
+
+

+ 47 - 0
cut_title.py

@@ -0,0 +1,47 @@
+#coding utf-8
+import sys
+import jieba 
+import os
+
+if __name__=="__main__":
+    #f1 = open(sys.argv[1])
+    stop_words = set('')
+    path = sys.argv[1]
+    files_dir = os.listdir(path)
+    #print(files_dir)
+    for file_name in files_dir:
+        if file_name.find('.txt')>-1:
+            f1 = open(path+"/"+file_name)
+            while True:
+                file_line = f1.readline()
+                if not file_line:
+                    break
+                file_line = file_line.strip()
+                stop_words.add(file_line)
+            f1.close()
+    #print(len(stop_words))
+    f = open(sys.argv[2])
+    f3 = open(sys.argv[3], 'w')
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title = items[1] 
+        cut_info =  jieba.lcut(title)
+        cut_arr = []
+        for cut_item in cut_info:
+            #print("cut_item:", cut_item)
+            if cut_item==' ':
+                continue
+            if cut_item in stop_words:
+                continue
+            cut_arr.append(cut_item)
+        vid_info = vid+'\t'+" ".join(cut_arr)
+        f3.write(vid_info.strip()+"\n")
+    f3.close()
+       

+ 47 - 0
cut_title_top.py

@@ -0,0 +1,47 @@
+#coding utf-8
+import sys
+import jieba 
+import os
+
+if __name__=="__main__":
+    #f1 = open(sys.argv[1])
+    stop_words = set('')
+    path = sys.argv[1]
+    files_dir = os.listdir(path)
+    #print(files_dir)
+    for file_name in files_dir:
+        if file_name.find('.txt')>-1:
+            f1 = open(path+"/"+file_name)
+            while True:
+                file_line = f1.readline()
+                if not file_line:
+                    break
+                file_line = file_line.strip()
+                stop_words.add(file_line)
+            f1.close()
+    #print(len(stop_words))
+    f = open(sys.argv[2])
+    f3 = open(sys.argv[3], 'w')
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title = items[1] 
+        cut_info =  jieba.lcut(title)
+        cut_arr = []
+        for cut_item in cut_info:
+            #print("cut_item:", cut_item)
+            if cut_item==' ':
+                continue
+            if cut_item in stop_words:
+                continue
+            cut_arr.append(cut_item)
+        vid_info = vid+'\t'+" ".join(cut_arr)
+        f3.write(vid_info.strip()+"\n")
+    f3.close()
+       

+ 337 - 0
db_help.py

@@ -0,0 +1,337 @@
+# coding:utf-8
+import redis
+from config import set_config
+config_  = set_config()
+conn_redis = None
+import pymysql
+
+class RedisHelper(object):
+    def __init__(self):
+        """
+        初始化redis连接信息
+        redis_info: redis连接信息, 格式:dict, {'host': '', 'port': '', 'password': ''}
+        """
+        redis_info = config_.REDIS_INFO
+        self.host = redis_info['host']
+        self.port = redis_info['port']
+        self.password = redis_info['password']
+
+    def connect(self):
+        """
+        连接redis
+        :return: conn
+        """
+        global conn_redis
+        if conn_redis is None:
+            pool = redis.ConnectionPool(host=self.host,
+                                        port=self.port,
+                                        password=self.password,
+                                        decode_responses=True)
+            conn = redis.Redis(connection_pool=pool)
+            conn_redis = conn
+        return conn_redis
+
+    def key_exists(self, key_name):
+        """
+        判断key是否存在
+        :param key_name: key
+        :return: 存在-True, 不存在-False
+        """
+        conn = self.connect()
+        return conn.exists(key_name)
+
+    def del_keys(self, key_name):
+        """
+        删除key
+        :param key_name: key
+        :return: None
+        """
+        conn = self.connect()
+        conn.delete(key_name)
+
+    def get_data_from_redis(self, key_name):
+        """
+        读取redis中的数据
+        :param key_name: key
+        :return: data
+        """
+        conn = self.connect()
+        if not conn.exists(key_name):
+            # key不存在
+            return None
+        data = conn.get(key_name)
+        return data
+
+    def set_data_to_redis(self, key_name, value, expire_time=24*3600):
+        """
+        新增数据
+        :param key_name: key
+        :param value: 元素的值 videoId
+        :param expire_time: 过期时间,单位:s,默认1天
+        :return: None
+        """
+        conn = self.connect()
+        conn.set(key_name, value, ex=int(expire_time))
+
+    def add_data_with_zset(self, key_name, data, expire_time=7*24*3600):
+        """
+        新增数据,有序set
+        :param key_name: key
+        :param data: 元素的值及对应分数 type-dict  {value: score}
+        :param expire_time: 过期时间,单位:s,默认7天
+        :return: None
+        """
+        if not data:
+            return
+        conn = self.connect()
+        # 数据量大时一次性写入耗时长,分批次写入
+        keys_list = list(data.keys())
+        zadd_data = {}
+        for i, key in enumerate(keys_list):
+            if i % 100 == 0:
+                if zadd_data:
+                    conn.zadd(key_name, zadd_data)
+                zadd_data = {key: data.get(key)}
+            else:
+                zadd_data[key] = data.get(key)
+        if zadd_data:
+            conn.zadd(key_name, zadd_data)
+        # 设置过期时间
+        conn.expire(key_name, int(expire_time))
+
+    def get_data_zset_with_index(self, key_name, start, end, desc=True, with_scores=False):
+        """
+        Fetch the members of a sorted set by index range.
+        :param key_name: key
+        :param start: start index, inclusive
+        :param end: end index, inclusive
+        :param desc: score ordering flag handed to zrange, default True (high to low)
+        :param with_scores: whether scores are fetched too, default False (members only)
+        :return: None when the key does not exist, otherwise the zrange result as-is.
+                 No int conversion of members happens here (that code is
+                 commented out below).
+        """
+        conn = self.connect()
+        if not conn.exists(key_name):
+            return None
+        # desc and with_scores are passed positionally as the 4th and 5th zrange arguments
+        data = conn.zrange(key_name, start, end, desc, with_scores)
+        return data
+        # Disabled member conversion, kept for reference:
+        # if with_scores:
+        #     return data
+        # else:
+        #     return [eval(value) for value in data]
+
+    def get_all_data_from_zset(self, key_name, desc=True, with_scores=False):
+        """
+        获取zset中所有元素的值
+        :param key_name: key
+        :param desc: 分数排序方式,默认从大到小
+        :param with_scores: 是否获取元素的分数,默认 False,只获取元素的值
+        :return: data 元素值列表(不包含分数),value(videoId)类型转换为int, 包含分数时不进行类型转换
+        """
+        conn = self.connect()
+        if not conn.exists(key_name):
+            return None
+        data = []
+        start = 0
+        step = 100
+        while True:
+            end = start + step - 1
+            temp = conn.zrange(key_name, start, end, desc, with_scores)
+            if not temp:
+                break
+            data.extend(temp)
+            start += step
+        return data
+
+    def get_score_with_value(self, key_name, value):
+        """
+        在zset中,根据元素的value获取对应的score
+        :param key_name: key
+        :param value: 元素的值
+        :return: score value对应的score
+        """
+        conn = self.connect()
+        return conn.zscore(key_name, value)
+
+    def update_score_with_value(self, key_name, value, score, expire_time=7*24*3600):
+        """
+        在zset中,修改元素value对应的score
+        :param key_name: key
+        :param value: 元素的值
+        :param score: value对应的score更新值
+        :param expire_time: 过期时间,单位:s,默认7天
+        """
+        conn = self.connect()
+        if conn.exists(key_name):
+            conn.zadd(key_name, {value: score})
+        else:
+            # key不存在时,需设置过期时间
+            conn.zadd(key_name, {value: score})
+            conn.expire(key_name, expire_time)
+
+    def remove_value_from_zset(self, key_name, value):
+        """
+        删除zset中的指定元素
+        :param key_name: key
+        :param value: 元素的值
+        :return: None
+        """
+        conn = self.connect()
+        conn.zrem(key_name, *value)
+
+    def remove_by_rank_from_zset(self, key_name, start, stop):
+        """
+        移除有序集中,指定排名(rank)区间内的所有成员
+        :param key_name: key
+        :param start: 开始位
+        :param stop: 结束位
+        :return: None
+        """
+        conn = self.connect()
+        conn.zremrangebyrank(name=key_name, min=start, max=stop)
+
+    def get_index_with_data(self, key_name, value):
+        """
+        根据元素的值获取在有序set中的位置,按照分数倒序(从大到小)
+        :param key_name: key
+        :param value: 元素的值
+        :return: idx 位置索引
+        """
+        conn = self.connect()
+        return conn.zrevrank(key_name, value)
+
+    def get_data_from_set(self, key_name):
+        """
+        获取set中的所有数据
+        :param key_name: key
+        :return: data
+        """
+        conn = self.connect()
+        if not conn.exists(key_name):
+            # key不存在
+            return None
+        data = []
+        cursor = 0
+        while True:
+            cur, temp = conn.sscan(key_name, cursor=cursor, count=2000)
+            data.extend(temp)
+            if cur == 0:
+                break
+            cursor = cur
+        return list(set(data))
+
+    def add_data_with_set(self, key_name, values, expire_time=30*60):
+        """
+        新增数据,set
+        :param key_name: key
+        :param values: 要添加的元素  类型-set
+        :param expire_time: 过期时间,单位:s,默认0.5小时
+        :return: None
+        """
+        conn = self.connect()
+        conn.sadd(key_name, *values)
+        # 设置过期时间
+        conn.expire(key_name, expire_time)
+
+    def data_exists_with_set(self, key_name, value):
+        """
+        判断元素value是否在集合key_name中
+        :param key_name: key
+        :param value: 需判断的元素
+        :return: 存在-True, 不存在-False
+        """
+        conn = self.connect()
+        return conn.sismember(key_name, value)
+
+    def remove_value_from_set(self, key_name, values):
+        """
+        删除set中的指定元素
+        :param key_name: key
+        :param values: 元素的值, 类型-set
+        :return: None
+        """
+        conn = self.connect()
+        conn.srem(key_name, *values)
+
+    def persist_key(self, key_name):
+        """
+        移除key的过期时间,将其转换为永久状态
+        :param key_name: key
+        :return:
+        """
+        conn = self.connect()
+        conn.persist(key_name)
+
+    def setnx_key(self, key_name, value, expire_time=5*60):
+        """
+        当key不存在时,将value塞入key中,key存在时不做操作
+        :param key_name: key
+        :param value: value
+        :return: 过期时间,单位:s,默认5分钟 type-int
+        """
+        conn = self.connect()
+        conn.setnx(name=key_name, value=value)
+        conn.expire(name=key_name, time=int(expire_time))
+
+    def update_expire_time(self, key_name, expire_time):
+        """
+        修改过期时间
+        :param key_name: key
+        :param expire_time: 过期时间
+        :return:
+        """
+        conn = self.connect()
+        conn.expire(name=key_name, time=int(expire_time))
+
+    def update_batch_setnx_key(self, data, expire_time=5*60):
+        conn = self.connect()
+        for key_name, v in data.items():
+            conn.set(name=key_name, value=v)
+            conn.expire(name=key_name, time=int(expire_time))
+        #self.disconnet()
+ 
+    #def disconnet(self):
+    #    conn_redis.disconnect()        
+class MysqlHelper(object):
+    def __init__(self, mysql_info):
+        """
+        初始化mysql连接信息
+        """
+        self.mysql_info = mysql_info
+
+    def get_data(self, sql):
+        """
+        查询数据
+        :param sql: sql语句
+        :return: data
+        """
+        # 连接数据库
+        conn = pymysql.connect(**self.mysql_info)
+        # 创建游标
+        cursor = conn.cursor()
+        try:
+            # 执行SQL语句
+            cursor.execute(sql)
+            # 获取查询的所有记录
+            data = cursor.fetchall()
+        except Exception as e:
+            return None
+        # 关闭游标对象
+        cursor.close()
+        # 关闭数据库连接
+        conn.close()
+        return data
+
+
+if __name__ == '__main__':
+    redis_helper = RedisHelper()
+    # key = 'com.weiqu.video.hot.recommend.item.score.20210901'
+    # res = redis_helper.get_score_with_value(key, 90797)
+    # print(res)
+    # redis_helper.remove_value_from_set(key_name=config_.RELEVANT_TOP_VIDEOS_KEY_NAME, values=(8633849,))
+    con = redis_helper.connect()
+    res = redis_helper.key_exists(key_name='eeew')
+    print(res)
+    redis_helper.setnx_key('eeew','1')
+    res = redis_helper.key_exists(key_name='eeew')
+    print(res)
+

+ 53 - 0
export_3_day.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Read one date partition of a table from odps.
+    :param date: date, type-string '%Y%m%d'; used as the partition 'dt=<date>'
+    :param project: type-string (not used by the body)
+    :param table: table name, type-string
+    :param connect_timeout: connect timeout setting (not used by the body)
+    :param read_timeout: read timeout setting (not used by the body)
+    :param pool_maxsize: (not used by the body)
+    :param pool_connections: (not used by the body)
+    :return: records -- whatever odps.read_table returns for that partition
+    """
+    # Only `table` and `date` reach the call; it goes through the module-level `odps` client.
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'loghubods.video_data_each_hour_dataset_3days_total_apptype'
+    sql = "select apptype, videoid, preview_users, preview_times, view_users, view_times, play_users, play_times, share_users, share_times, return_users from loghubods.video_data_each_hour_dataset_3days_total_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/3_days_video_data_"+now_date, sep='\t', index=None) 

+ 53 - 0
export_7_day.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Read one date partition of a table from odps.
+    :param date: date, type-string '%Y%m%d'; used as the partition 'dt=<date>'
+    :param project: type-string (not used by the body)
+    :param table: table name, type-string
+    :param connect_timeout: connect timeout setting (not used by the body)
+    :param read_timeout: read timeout setting (not used by the body)
+    :param pool_maxsize: (not used by the body)
+    :param pool_connections: (not used by the body)
+    :return: records -- whatever odps.read_table returns for that partition
+    """
+    # Only `table` and `date` reach the call; it goes through the module-level `odps` client.
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'loghubods.video_data_each_hour_dataset_7days_total_apptype'
+    sql = "select apptype, videoid, preview_users, preview_times, view_users, view_times, play_users, play_times, share_users, share_times, return_users from loghubods.video_data_each_hour_dataset_7days_total_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/7_days_video_data_"+now_date, sep='\t', index=None) 

+ 53 - 0
export_hour_vid.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'video_data_each_hour_dataset_24h_total_apptype'
+    sql = "select apptype, videoid, lastonehour_view, lastonehour_view_total, lastonehour_play, lastonehour_play_total,lastonehour_share, lastonehour_share_total, lastonehour_return from loghubods.video_each_hour_update_province_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/hour_video_data_"+now_date, sep='\t', index=None) 

+ 53 - 0
export_vid.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'video_data_each_hour_dataset_24h_total_apptype'
+    sql = "select apptype, videoid, preview人数, preview次数, view人数, view次数, play人数, play次数, share人数, share次数, 回流人数 from loghubods.video_data_each_hour_dataset_24h_total_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/video_data_"+now_date, sep='\t', index=None) 

+ 54 - 0
extract_cur_share_log.py

@@ -0,0 +1,54 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    last7day=sys.argv[1]
+    now_date=sys.argv[2]
+    print("now date:", now_date)
+    table = 'user_share_log'
+    sql = "select machinecode, shareobjectid from loghubods.user_share_log_per5min where dt between '"+last7day+"' and '"+now_date+"' and topic='share';"
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/user_cur_day_item_share_"+now_date, sep='\t') 

+ 54 - 0
extract_share_log.py

@@ -0,0 +1,54 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    last7day=sys.argv[1]
+    now_date=sys.argv[2]
+    print("now date:", now_date)
+    table = 'user_share_log'
+    sql = "select machinecode, shareobjectid from loghubods.user_share_log where dt between '"+last7day+"' and '"+now_date+"' and topic='share';"
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/user_item_share_"+now_date, sep='\t') 

+ 78 - 0
extract_title_tag.py

@@ -0,0 +1,78 @@
+#coding utf-8
+import sys
+import jieba
+from jieba import analyse
+import jieba.posseg as pseg
+import re
+ 
+import os
+
+if __name__=="__main__":
+    #f1 = open(sys.argv[1])
+    stop_words = set('')
+    '''path = sys.argv[1]
+    files_dir = os.listdir(path)
+    #print(files_dir)
+    for file_name in files_dir:
+        if file_name.find('.txt')>-1:
+            f1 = open(path+"/"+file_name)
+            while True:
+                file_line = f1.readline()
+                if not file_line:
+                    break
+                file_line = file_line.strip()
+                stop_words.add(file_line)
+            f1.close()
+    #print(len(stop_words))'''
+    analyse.set_stop_words("all_stopword.txt")
+    f = open(sys.argv[1])
+    f3 = open(sys.argv[2], 'w')
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title = items[1] 
+        #cut_info =  pseg.cut(title)
+        #tfif_top =jieba.analyse.extract_tags(title,topK=3, withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
+        tfif_top =jieba.analyse.extract_tags(title,topK=4, withWeight=True)
+        #text_rank_top =jieba.analyse.textrank(title,topK=3,withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
+        #print(title)
+        #print(tfif_top)
+        #print(text_rank_top)
+        tags = []
+        for word in tfif_top:
+            #pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
+            #result = pattern.match(word)
+            #if result:
+            #    continue
+            if word[0].isdigit():
+                continue
+            try:
+                vid = float(word[0])
+                continue
+            except:
+                tags.append(str(word[0]))
+                #print('%s %s' % (word[0], word[1]))
+            #print('%s %s' % (word[0], word[1]))
+        if len(tags)>0:
+            #print(tags)
+            vid_info=str(vid)+"\t"+",".join(tags)
+            f3.write(vid_info.strip()+"\n")
+            #print("--------------")
+        '''cut_arr = []
+        for cut_item in cut_info:
+            #print("cut_item:", cut_item)
+            if cut_item==' ':
+                continue
+            if cut_item in stop_words:
+                continue
+            cut_arr.append(cut_item)'''
+        #vid_info = vid+'\t'+" ".join(cut_arr)
+        #f3.write(vid_info.strip()+"\n")
+    f3.close()
+       

+ 54 - 0
extract_user_action.py

@@ -0,0 +1,54 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    last7day=sys.argv[1]
+    now_date=sys.argv[2]
+    print("now date:", now_date)
+    table = 'user_action_log_base'
+    sql = "select  mid, videoid, businesstype, clienttimestamp, return from loghubods.user_action_log_base_addrealplay where dt between '"+last7day+"' and '"+now_date+"' and businesstype in ('videoShareFriend');"
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/user_action_"+now_date, sep='\t') 

+ 53 - 0
extract_video_info.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # collection默认一个dict
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # 解析record中的每一个元组,存储方式为(k,v),以k作为key,存储每一列的内容;
+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框,并转置,不转置的话是横条数据
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'video_data_each_hour_dataset_24h_total_apptype'
+    sql = "select id, title, video_path, cover_img_path,self_cover_img_path,play_count, share_count, reported_count, favoriteds, total_time, tag_count,stage_recommend_examine_status, sensitive_status, new_share_image_path from videoods.wx_video_per1h where status=1 and examine_status=1 ";
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/video_data_info_"+now_date, sep='\t', index=None) 

+ 46 - 0
filter_video.py

@@ -0,0 +1,46 @@
+#coding utf-8
+import sys
+from utils import  filter_video_status
+
+if __name__=="__main__":
+    f = open(sys.argv[1])
+    video_set = set('')
+    video_rec = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        items = line.strip().split("\t")
+        if len(items)<3:
+            continue
+        vid = -1
+        try:
+            vid = int(items[2])
+        except:
+            continue
+        if vid ==-1:
+            continue
+        video_set.add(vid)
+        video_rec.append(line)
+    f.close()
+    print(len(video_set))
+    video_list = list(video_set)
+    left_video_list = filter_video_status(video_list)
+    left_video_set = set(left_video_list)
+    print(left_video_list)
+    f2 = open(sys.argv[2], 'w')
+    for line in video_rec:
+        items  = line.strip().split("\t")
+        if len(items)<3:
+           continue
+        vid = -1
+        try:
+            vid  = int(items[2])
+        except:
+            continue
+        if vid not in left_video_set:
+            continue
+        f2.write(line)  
+    f2.close()
+    #print(len(left_video_list))
+    

+ 95 - 0
get3HotRecall.py

@@ -0,0 +1,95 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/3_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_pv
+            share_users = item_info[3]+share_users
+            share_pv = item_info[4]+share_pv
+            return_users = item_info[5]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/3_days_recall_hot_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        if view_pv<100 or view_users<10:
+            continue
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+30)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+100)
+        backrate = float(return_users)/(float(view_users)+30)
+        ctr_score = float(play_pv)/float(view_pv+100)
+        if ctr_score<=0.5:
+            continue
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        info_dict[k] = score_info
+    sorted_v = sorted(info_dict.items(), key=lambda s:s[1][1], reverse=True) 
+    print("sorted_v:", sorted_v[:30])
+    recall_name = "hot_3day:"
+    hot_recall_res = []
+    for item in sorted_v[:5]:
+        hot_recall_res.append((item[0], item[1][1]))
+    if len(hot_recall_res)>5:
+        score_info = json.dumps(hot_recall_res)
+        print("score_info:", score_info)
+        redis_helper.set_data_to_redis(recall_name, score_info, 60*60*24*15)
+        f.write(recall_name+"\t"+score_info+"\n")
+    f.close()
+    #info_dict[k] = score_info
+    #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    #f.close()
+
+     

+ 94 - 0
get7HotRecall.py

@@ -0,0 +1,94 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/7_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_pv
+            share_users = item_info[3]+share_users
+            share_pv = item_info[4]+share_pv
+            return_users = item_info[5]+return_users 
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/7_days_recall_hot_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        if view_pv<100 or view_users<10:
+            continue
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+5)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        ctr_score = float(play_pv)/float(view_pv+5)
+        if ctr_score<=0.5:
+            continue
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        info_dict[k] = score_info
+    sorted_v = sorted(info_dict.items(), key=lambda s:s[1][1], reverse=True) 
+    print("sorted_v:", sorted_v[:100])
+    recall_name = "hot_7day:"
+    hot_recall_res = []
+    for item in sorted_v[:100]:
+        hot_recall_res.append((item[0], item[1][1]))
+    if len(hot_recall_res)>10:
+        score_info = json.dumps(hot_recall_res)
+        print("score_info:", score_info)
+        redis_helper.set_data_to_redis(recall_name, score_info, 60*60*24*15)
+        f.write(recall_name+"\t"+score_info+"\n") 
+    #info_dict[k] = score_info
+    #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 83 - 0
get_batch_sim_k.py

@@ -0,0 +1,83 @@
+#coding utf-8
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+import time
+
+def gen_i2i(index_item, embeddings,i2i):
+    fw=open(i2i,"w")
+    #print(i2i)
+    start_time = time.time()
+    #xb = embeddings
+    xb=np.array(embeddings).astype('float32')
+    #print(xb)
+    #index.add(xb)
+    dim, measure = 64, faiss.METRIC_L2  
+    param =  'IVF100,PQ16'
+    index = faiss.index_factory(dim, param, measure) 
+    #print(index.is_trained)                          # 此时输出为False,因为倒排索引需要训练k-means, 
+    index.train(xb) 
+    end_time = time.time()
+    print("time:", (end_time-start_time))
+    #index=faiss.IndexFlatL2(100)
+    #index.add(embed_matrix)
+    #the candicate matrix is embed_matrix,but the search matrix is the same.
+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
+    batch = 10000
+    num = len(embeddings)
+    per_rounds = int(num/batch)+1
+    #index=faiss.IndexFlatL2(64)
+    index.add(xb)
+    print("cost time:", (end_time-start_time))
+    #distence_matrix,recall_list=index.search(xb, 20)
+    #print(distence_matrix)
+    #print(recall_list)
+    for i in range(per_rounds):
+        per_embedding = xb[i:(i+1)*batch]
+        #print(per_embedding)
+        #print(len(per_embedding))
+        distence_matrix,recall_list=index.search(per_embedding, 20)
+        #print("distence_matrix:", distence_matrix)
+        #print("recall_list:", recall_list)
+       	for idx,rec_arr in enumerate(recall_list):
+            #print("idx:", idx)
+            orgin_item=str(index_item[idx])
+            #print("orgin_item:", orgin_item)
+            #print("rec_arr:", rec_arr)
+            recall_str=""
+            for re_id in rec_arr[1:]:
+                if re_id in index_item:
+                    recall_idstr=str(index_item[re_id])
+                    recall_str=recall_str+","+recall_idstr
+            fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
+
+if __name__ == '__main__':
+    f = open(sys.argv[1])
+    index = 0
+    index_dict = {}
+    index_arr = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        items = line.strip().split(" ")
+        try:
+            vid = int(items[0])
+            vid_vec = eval(" ".join(items[1:]))
+            vid_vec=np.array(vid_vec)
+            float_arr = vid_vec.astype(np.float64).tolist()
+            #print(float_arr)
+            index_arr.append(float_arr)
+            #index +=1
+            index_dict[index] = vid
+            index +=1
+            #break
+            #print(index_arr)
+        except:
+            #break
+            continue
+    f.close()
+    #print(index_arr)
+    gen_i2i(index_dict, index_arr, "i2i_result")

+ 59 - 0
get_sim_k.py

@@ -0,0 +1,59 @@
+#coding utf-8
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+import time
+
+def gen_i2i(index_item, embeddings,i2i):
+    fw=open(i2i,"w")
+    #print(i2i)
+    embed_matrix=np.array(embeddings).astype('float32')
+    #print(embed_matrix)
+    index=faiss.IndexFlatL2(100)
+    index.add(embed_matrix)
+    #the candicate matrix is embed_matrix,but the search matrix is the same.
+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
+    distence_matrix,recall_list=index.search(embed_matrix, 20)
+    for idx,rec_arr in enumerate(recall_list):
+        #print("idx:", idx)
+        orgin_item=str(index_item[idx])
+        recall_str=""
+        #rec_arr=[0 6 3 8 7 1]
+        for re_id in rec_arr[1:]:
+            recall_idstr=str(index_item[re_id])
+            #print(recall_idstr)
+            recall_str=recall_str+","+recall_idstr
+        fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
+
+if __name__ == '__main__':
+    f = open(sys.argv[1])
+    index = 0
+    start_time = time.time()
+    index_dict = {}
+    index_arr = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        line = line.strip().replace("[","").replace("]","")
+        #print(eval(line))
+        items = line.split(" ")
+        if len(items)<2:
+            continue
+        try:
+           vid = int(items[0])
+           #vid_vec = items[1:]
+           print(line.split(" "))
+           vid_vec = eval(" ".join(items[1:]))
+           index_arr.append(vid_vec)
+           index_dict[index] = vid
+           index +=1
+        except:
+           continue
+    f.close()
+    print(len(index_arr))
+    end_time = time.time()
+    print("time:", (end_time-start_time))
+    #gen_i2i(index_dict, index_arr, "i2i_result")

+ 30 - 0
import_redist.py

@@ -0,0 +1,30 @@
+#coding utf-8
+import sys
+import json
+from db_help import RedisHelper
+
+if __name__=="__main__":
+     f2 = open(sys.argv[2], 'w')
+     with  open(sys.argv[1]) as f:
+         rec_json_list=json.load(f)
+         #print(rec_json_list)
+         import_data_dict = {}
+         for line in rec_json_list:
+             #print(line)
+             rec_list = line[1][:10]
+             rec_item_list = []
+             for rec_item in rec_list:
+                 rec_item_list.append((rec_item[0], round(rec_item[1],3)))
+             res_info = json.dumps(rec_item_list)
+             
+             f2.write(str(line[0])+"\t"+res_info+"\n")
+             key="sim_hot_"+str(line[0])
+             import_data_dict[key] = res_info
+         redis_helper = RedisHelper()
+         redis_helper.update_batch_setnx_key(import_data_dict, 60*60*24*7)
+         #con = redis_helper.connect()
+         res = redis_helper.get_data_from_redis("sim_hot_14330133")
+         print(res)
+         f2.close()
+     f.close()
+   

+ 56 - 0
predict.py

@@ -0,0 +1,56 @@
+#coding utf-8
+import sys
+from gensim import models
+import numpy as np
+
+if __name__=="__main__":
+    #model = models.word2vec.Word2Vec.load('word2vec.txt')
+    #print(model.wx)
+    f1 = open('word2vec.txt')
+    word_dict = {}
+    while True:
+        line = f1.readline()
+        if not line:
+            break
+        items = line.strip().split(" ")
+        if len(items)<64:
+            continue
+        arr = []
+        for w in items[1:]:
+            arr.append(float(w))
+        word_dict[items[0]] = arr
+    #print(word_dict)
+    f  = open(sys.argv[1])
+    num = 0
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        num = num+1
+        if num == 1:
+            continue
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title_arr = items[1].split(" ")
+        title_info = np.zeros(64)
+        word_len = 0
+        for word in title_arr:
+            if word in word_dict:
+                 #print(title_info)
+                 #print(word)
+                 word_vec = word_dict[word]
+                 #print(word_vec)
+                 title_info = np.add(title_info, word_vec)
+                 word_len +=1
+        #print(title_info)
+        title_info_list = []
+        if word_len<=0:
+            continue
+        for j in title_info:
+            title_info_list.append(j/word_len)
+        #print("title_info_list:", title_info_list)
+        print(vid,"\t",title_info_list)
+        
+    

+ 53 - 0
process_video.py

@@ -0,0 +1,53 @@
+#coding utf-8
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+
+
+def gen_i2i(index_item, embeddings,i2i):
+    fw=open(i2i,"w")
+    #print(i2i)
+    embed_matrix=np.array(embeddings).astype('float32')
+    #print(embed_matrix)
+    index=faiss.IndexFlatL2(100)
+    index.add(embed_matrix)
+    #the candicate matrix is embed_matrix,but the search matrix is the same.
+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
+    distence_matrix,recall_list=index.search(embed_matrix, 20)
+    for idx,rec_arr in enumerate(recall_list):
+        #print("idx:", idx)
+        orgin_item=str(index_item[idx])
+        recall_str=""
+        #rec_arr=[0 6 3 8 7 1]
+        for re_id in rec_arr[1:]:
+            recall_idstr=str(index_item[re_id])
+            #print(recall_idstr)
+            recall_str=recall_str+","+recall_idstr
+        fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
+
+if __name__ == '__main__':
+    f = open(sys.argv[1])
+    index = 0
+    index_dict = {}
+    index_arr = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        line = line.strip()
+        #print(line)
+        items = line.split(" ")
+        #print(int(items[0]))
+        try:
+            vid = int(items[0])
+            print(line)
+            #print(str(vid)+"\t"+items[1:])
+            #print(index_arr)
+        except:
+            #print(int(items[0]))
+            continue
+    f.close()
+    #print(len(index_arr))
+    #gen_i2i(index_dict, index_arr, "i2i_result")

+ 68 - 0
run.sh

@@ -0,0 +1,68 @@
+#!/bin/bash
+source ~/.bash_profile
+source ~/.bashrc
+conda activate tf2 
+
+cd /data/SimRecall
+
+#1. download data
+nowday=`date  +"%Y%m%d" -d -0days`
+last7day=`date  +"%Y%m%d" -d -15days`
+echo ${nowday} 
+echo ${last7day}
+mkdir -p ./data/
+mkdir -p ./logs/
+#conda activate py36
+
+nowhour=`date  +"%Y%m%d%H" -d -0days`
+nowstart=$nowday'000000'
+nowhour=${nowhour}'0000'
+#nowhour='20230601140000'
+echo $nowhour
+echo ${nowstart}
+echo ${last7day}
+
+#python extract_cur_share_log.py ${nowstart} ${nowhour}
+#if [ $? -ne 0 ];
+#then
+   # msg = "[ERROR] simrecall extract_share_log"
+   # sh sendmsg.sh  $nowday  $msg
+   # echo "[ERROR] echo 'extract_share_log"
+   # exit 255
+#fi
+
+
+python extract_share_log.py ${last7day} ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simrecall extract_share_log"
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_share_log"
+    exit 255
+fi
+
+python filter_video.py ./data/user_item_share_${nowday} ./data/user_item_share_filter_${nowday}
+
+
+exit
+#nowday='20230505'
+#2. cal i2i result
+#python calI2I2.py ${nowday}  ${nowhour}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simrecall calI2I.py"
+    #sh sendmsg.sh $nowday $msg
+    echo $msg
+    exit -1
+fi
+
+#3.import res
+#python import_redist.py "./data/rec_result3_"${nowhour}".json"  "./data/redis_cls_"${nowhour}".json"
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simhot recall import_redist.py"
+    sh sendmsg.sh  $nowday  $msg
+    echo $msg
+    exit -1
+fi
+echo 'finish sorted'

+ 95 - 0
run_3day.sh

@@ -0,0 +1,95 @@
+#!/bin/bash
+source ~/.bash_profile
+source ~/.bashrc
+
+#conda activate python36 
+cd /data/rec_project/OffLineRec
+#cd /home/rec/project/git_project/OffLineRec 
+#cd /data/rec_project/OffLineRec
+#1. download data
+nowday=`date  +"%Y%m%d%H" -d -1hours`
+echo ${nowday} 
+#exit
+#nowday='2023052413'
+#3.import res
+mkdir -p ./data/
+
+python export_3_day.py  ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] export_3_day.py"
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] export_3_day.py"
+    exit 255
+fi
+#python export_7_day.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] export_7_day.py"
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo export_7_day.py"
+    exit 255
+fi
+python export_hour_vid.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] export_7_day.py"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo export_7_day.py"
+    exit 255
+fi
+
+
+python calHourData.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] export_hour_data.py"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo export_7_day.py"
+    exit 255
+fi
+
+
+python calCtr1days.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] export_1_day.py"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo export_7_day.py"
+    exit 255
+fi
+
+python calCtr3days.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal3ctr "
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'cal3days"
+    exit 255
+fi
+
+#python calCtr7days.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal 7 day ctr "
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'calCtr.py"
+    exit 255
+fi
+python compose_score_3day.py ${nowday}
+#python get3HotRecall.py ${nowday} 
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal  3 hot "
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'calCtr.py"
+    exit 255
+fi
+python get3HotRecall.py ${nowday}
+
+#python get7HotRecall.py ${nowday}
+
+
+echo "finish sorted"
+
+

+ 62 - 0
run_ctr.sh

@@ -0,0 +1,62 @@
+#!/bin/bash
+source ~/.bash_profile
+source ~/.bashrc
+
+#conda activate python36 
+cd /data/rec_project/OffLineRec
+#cd /home/rec/project/git_project/OffLineRec 
+#cd /data/rec_project/OffLineRec
+#1. download data
+nowday=`date  +"%Y%m%d%H" -d -0days`
+echo ${nowday} 
+#3.import res
+mkdir -p ./data/
+
+python export_vid.py  ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] sorted extract_vid_log"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_vid.py"
+    exit 255
+fi
+python export_hour_vid.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] sorted extract_hour_log"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_hour_vid.py"
+    exit 255
+fi
+
+
+python calCtr.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal  ctr "
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'calCtr.py"
+    exit 255
+fi
+
+python calHourCtr.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal hour ctr "
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'calCtr.py"
+    exit 255
+fi
+python compose_score2.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal compose_score "
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'compose_score.py"
+    exit 255
+fi
+
+sh clean.sh
+echo "finish sorted"
+
+

+ 32 - 0
run_extract_tag.sh

@@ -0,0 +1,32 @@
+#!/bin/bash
+# Extract user actions for the window [yesterday, today].
+# The exit status of the extraction is not checked, and the follow-up
+# video-info extraction is disabled.
+source ~/.bash_profile
+source ~/.bashrc
+
+conda activate python36
+
+#1. download data
+nowday=$(date +"%Y%m%d" -d -0days)
+# NOTE: despite its name this is 1 day back, not 7.
+last7day=$(date +"%Y%m%d" -d -1days)
+echo ${nowday}
+#3.import res
+mkdir -p ./data/
+
+python extract_user_action.py  ${last7day} ${nowday}
+# Disabled failure handling for the step above:
+#if [ $? -ne 0 ];
+#then
+#    msg="[ERROR] sorted extract_vid_log"
+#    sh sendmsg.sh  "$nowday"  "$msg"
+#    exit 255
+#fi
+
+# Disabled step:
+#python extract_video_info.py ${nowday}
+#if [ $? -ne 0 ];
+#then
+#    msg="[ERROR] cal ctr "
+#    sh sendmsg.sh  "$nowday"  "$msg"
+#    exit 255
+#fi
+#echo "finish sorted"
+

+ 65 - 0
run_hour.sh

@@ -0,0 +1,65 @@
+#!/bin/bash
+source ~/.bash_profile
+source ~/.bashrc
+conda activate tf2
+
+cd /data/OffLineRec
+
+#1. download data
+nowday=`date  +"%Y%m%d" -d -0days`
+last7day=`date  +"%Y%m%d" -d -15days`
+echo ${nowday} 
+echo ${last7day}
+mkdir -p ./data/
+mkdir -p ./logs/
+#conda activate py36
+
+nowhour=`date  +"%Y%m%d%H" -d -0days`
+nowstart=$nowday'000000'
+nowhour=${nowhour}'0000'
+#nowhour='20230601140000'
+echo $nowhour
+echo ${nowstart}
+echo ${last7day}
+
+python extract_cur_share_log.py ${nowstart} ${nowhour}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simrecall extract_share_log"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_share_log"
+    exit 255
+fi
+
+python filter_video.py ./data/user_cur_day_item_share_${nowhour} ./data/user_cur_day_item_share_filter_${nowhour}
+
+#python extract_share_log.py ${last7day} ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simrecall extract_share_log"
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_share_log"
+    exit 255
+fi
+
+#nowday='20230505'
+#2. cal i2i result
+python calI2I2.py ${nowday}  ${nowhour}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simrecall calI2I.py"
+    #sh sendmsg.sh $nowday $msg
+    echo $msg
+    exit -1
+fi
+
+#3.import res
+python import_redist.py "./data/rec_result3_"${nowhour}".json"  "./data/redis_cls_"${nowhour}".json"
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] simhot recall import_redist.py"
+    sh sendmsg.sh  $nowday  $msg
+    echo $msg
+    exit -1
+fi
+echo 'finish sorted'

+ 36 - 0
sendmsg.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+nowdate=$1
+content=${@:2}
+content=${content//\<font color=\'red\'\>/}
+content=${content//\<\/font\>\<\/br\>/}
+content=${content//\ /}
+echo "warn content is : $content"
+
+api=https://open.feishu.cn/open-apis/bot/v2/hook/00cf9bb4-ecea-4f0d-bf02-1a20592a916c #飞书机器人webhook 地址
+
+
+curl -X POST \
+  $api \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "msg_type": "post",
+    "content": {
+        "post": {
+            "zh_cn": {
+                "title": "recall alert",
+                "content": [
+                    [
+                        {
+                            "tag": "text",
+                            "un_escape": true,
+                            "text": "'$content'"
+                        }
+                    ],
+                    [
+
+                    ]
+                ]
+            }
+        }
+    }
+}'

+ 30 - 0
test.py

@@ -0,0 +1,30 @@
+#coding utf-8
+import sys
+import json
+from db_help import RedisHelper
+
+if __name__=="__main__":
+     '''f2 = open(sys.argv[2], 'w')
+     with  open(sys.argv[1]) as f:
+         rec_json_list=json.load(f)
+         #print(rec_json_list)
+         import_data_dict = {}
+         for line in rec_json_list:
+             #print(line)
+             rec_list = line[1][:10]
+             rec_item_list = []
+             for rec_item in rec_list:
+                 rec_item_list.append((rec_item[0], round(rec_item[1],3)))
+             res_info = json.dumps(rec_item_list)
+             
+             f2.write(str(line[0])+"\t"+res_info+"\n")
+             key="sim_hot_"+line[0]
+             import_data_dict[key] = res_info'''
+     redis_helper = RedisHelper()
+         #redis_helper.update_batch_setnx_key(import_data_dict, 60*60*24*7)
+         #con = redis_helper.connect()
+     res = redis_helper.get_data_from_redis("hot_7day:")
+     print(res)
+     #f2.close()
+     #f.close()
+   

+ 35 - 0
test.sh

@@ -0,0 +1,35 @@
+#!/bin/bash
+source ~/.bash_profile
+source ~/.bashrc
+
+conda activate base 
+cd /data/rec_project/OffLineRec
+#1. download data
+nowday=`date  +"%Y%m%d%H" -d -0days`
+echo ${nowday} 
+#3.import res
+#mkdir -p ./data/
+
+nowday='2023051814'
+#python export_hour_vid.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] sorted extract_hour_log"
+    #sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'extract_hour_vid.py"
+    exit 255
+fi
+#python calHourCtr.py ${nowday}
+
+python compose_score.py ${nowday} 
+exit
+pytho calCtr.py ${nowday}
+if [ $? -ne 0 ];
+then
+    msg = "[ERROR] cal ctr "
+    sh sendmsg.sh  $nowday  $msg
+    echo "[ERROR] echo 'calCtr.py"
+    exit 255
+fi
+echo "finish sorted"
+

+ 11 - 0
test_faiss.py

@@ -0,0 +1,11 @@
+import numpy as np
+d = 64                                           # 向量维度
+nb = 100000                                      # index向量库的数据量
+nq = 10000                                       # 待检索query的数目
+np.random.seed(1234)             
+xb = np.random.random((nb, d)).astype('float32')
+#xb[:, 0] += np.arange(nb) / 1000.                # index向量库的向量
+xq = np.random.random((nq, d)).astype('float32')
+#xq[:, 0] += np.arange(nq) / 1000.
+
+print(xb)

+ 12 - 0
train_vec.sh

@@ -0,0 +1,12 @@
+# Title-embedding training pipeline. Every step is currently commented out,
+# so running this script only changes directory and sets nowday.
+cd /home/rec/project/git_project/OffLineRec
+
+#1.cut_title
+nowday=$(date +"%Y%m%d" -d -0days)
+# Pinned data date; overrides the value computed above.
+nowday=20230512
+
+#python cut_title.py ./stopwords/ ./data/video_title_${nowday} ./data/video_cut_title_${nowday}
+
+#python word2vec.py 
+
+
+#python predict.py ./data/word2vec_cut_title  > ./data/video_title_embedding

+ 550 - 0
utils.py

@@ -0,0 +1,550 @@
+# coding:utf-8
+import pickle
+import os
+import requests
+import json
+import traceback
+import pandas as pd
+
+from odps import ODPS
+from config import set_config
+from db_help import  MysqlHelper, RedisHelper
+#from log import Log
+
+config_ = set_config()
+#log_ = Log()
+
+
+def execute_sql_from_odps(project, sql, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=connect_timeout,
+        read_timeout=read_timeout,
+        pool_maxsize=pool_maxsize,
+        pool_connections=pool_connections
+    )
+    records = odps.execute_sql(sql=sql)
+    return records
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    从odps获取数据
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=connect_timeout,
+        read_timeout=read_timeout,
+        pool_maxsize=pool_maxsize,
+        pool_connections=pool_connections
+    )
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+
+def check_table_partition_exits(date, project, table, connect_timeout=3000, read_timeout=500000,
+                                pool_maxsize=1000, pool_connections=1000):
+    """
+    判断表中是否存在这个分区
+    :param date: 日期 type-string '%Y%m%d'
+    :param project: type-string
+    :param table: 表名 type-string
+    :param connect_timeout: 连接超时设置
+    :param read_timeout: 读取超时设置
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=connect_timeout,
+        read_timeout=read_timeout,
+        pool_maxsize=pool_maxsize,
+        pool_connections=pool_connections
+    )
+    t = odps.get_table(name=table)
+    return t.exist_partition(partition_spec=f'dt={date}')
+
+
+'''def write_to_pickle(data, filename, filepath=config_.DATA_DIR_PATH):
+    """
+    将数据写入pickle文件中
+    :param data: 数据
+    :param filename: 写入的文件名
+    :param filepath: 文件存放路径,默认为config_.DATA_DIR_PATH
+    :return: None
+    """
+    if not os.path.exists(filepath):
+        os.makedirs(filepath)
+    file = os.path.join(filepath, filename)
+    with open(file, 'wb') as wf:
+        pickle.dump(data, wf)
+
+
+def read_from_pickle(filename, filepath=config_.DATA_DIR_PATH):
+    """
+    从pickle文件读取数据
+    :param filename: 文件名
+    :param filepath: 文件存放路径,默认为config_.DATA_DIR_PATH
+    :return: data
+    """
+    file = os.path.join(filepath, filename)
+    if not os.path.exists(file):
+        return None
+    with open(file, 'rb') as rf:
+        data = pickle.load(rf)
+    return data '''
+
+
+def send_msg_to_feishu(webhook, key_word, msg_text):
+    """发送消息到飞书"""
+    headers = {'Content-Type': 'application/json'}
+    payload_message = {
+        "msg_type": "text",
+        "content": {
+            "text": '{}: {}'.format(key_word, msg_text)
+        }
+    }
+    response = requests.request('POST', url=webhook, headers=headers, data=json.dumps(payload_message))
+    print(response.text)
+
+
+def send_msg_to_feishu_new(webhook, key_word, title, msg_list):
+    """发送消息到飞书"""
+    headers = {'Content-Type': 'application/json'}
+    content_list = [
+        [
+            {
+                "tag": "text",
+                "text": msg
+            }
+        ]
+        for msg in msg_list
+    ]
+    payload_message = {
+        "msg_type": "post",
+        "content": {
+            "post": {
+                "zh_cn": {
+                    "title": f"{key_word}: {title}",
+                    "content": content_list,
+                }
+            }
+        }
+    }
+    response = requests.request('POST', url=webhook, headers=headers, data=json.dumps(payload_message))
+    print(response.text)
+
+
+def request_post(request_url, request_data=None, **kwargs):
+    """
+    post 请求 HTTP接口
+    :param request_url: 接口URL
+    :param request_data: 请求参数
+    :return: res_data json格式
+    """
+    try:
+        response = requests.post(url=request_url, json=request_data, **kwargs)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
+        else:
+            #log_.info(f"response.status_code: {response.status_code}")
+            return None
+    except Exception as e:
+        #log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text='rov-offline{} - 接口请求失败:{}, exception: {}'.format(config_.ENV_TEXT, request_url, e)
+        )
+        return None
+
+
+def request_get(request_url):
+    """
+    get 请求 HTTP接口
+    :param request_url: 接口URL
+    :return: res_data json格式
+    """
+    try:
+        response = requests.get(url=request_url)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
+        else:
+            #log_.info(f"response.status_code: {response.status_code}")
+            return None
+    except Exception as e:
+        #log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text='rov-offline{} - 接口请求失败:{}, exception: {}'.format(config_.ENV_TEXT, request_url, e)
+        )
+        return None
+
+
+def data_normalization(data):
+    """
+    对结果做归一化处理(Min-Max Normalization),将分数控制在[0, 100]
+    :param data: type-list
+    :return: normal_data, type-list 归一化后的数据
+    """
+    x_max = max(data)
+    x_min = min(data)
+    normal_data = [(x-x_min)/(x_max-x_min)*100 for x in data]
+    return normal_data
+
+
+def filter_video_status(video_ids):
+    """
+    对视频状态进行过滤
+    :param video_ids: 视频id列表 type-list
+    :return: filtered_videos
+    """
+    i = 0
+    while i < 3:
+        try:
+            mysql_helper = MysqlHelper(mysql_info=config_.FILTER_MYSQL_INFO)
+            video_status_sql = "SELECT t1.id AS 'video_id', " \
+                               "t1.transcode_status AS 'transcoding_status', " \
+                               "t2.audit_status AS 'audit_status', " \
+                               "t2.video_status AS 'open_status', " \
+                               "t2.recommend_status AS 'applet_rec_status', " \
+                               "t2.app_recommend_status AS 'app_rec_status', " \
+                               "t3.charge AS 'payment_status', " \
+                               "case when t4.max_validate_count is null then 0 else t4.max_validate_count end AS 'encryption_status' " \
+                               "FROM longvideo.wx_video t1 " \
+                               "LEFT JOIN longvideo.wx_video_status t2 ON t1.id= t2.video_id " \
+                               "LEFT JOIN longvideo.wx_video_detail t3 ON t1.id= t3.video_id " \
+                               "LEFT JOIN longvideo.wx_video_pwd t4 ON t1.id= t4.video_id"
+            if len(video_ids) == 1:
+                sql = "SELECT video_id " \
+                      "FROM ({}) " \
+                      "WHERE audit_status = 5 " \
+                      "AND applet_rec_status IN (1, -6) " \
+                      "AND open_status = 1 " \
+                      "AND payment_status = 0 " \
+                      "AND encryption_status != 5 " \
+                      "AND transcoding_status = 3 " \
+                      "AND video_id IN ({});".format(video_status_sql, video_ids[0])
+                data = mysql_helper.get_data(sql=sql)
+
+            else:
+                data = []
+                for i in range(len(video_ids) // 200 + 1):
+                    sql = "SELECT video_id " \
+                          "FROM ({}) " \
+                          "WHERE audit_status = 5 " \
+                          "AND applet_rec_status IN (1, -6) " \
+                          "AND open_status = 1 " \
+                          "AND payment_status = 0 " \
+                          "AND encryption_status != 5 " \
+                          "AND transcoding_status = 3 " \
+                          "AND video_id IN {};".format(video_status_sql, tuple(video_ids[i*200:(i+1)*200]))
+                    select_res = mysql_helper.get_data(sql=sql)
+                    if select_res is not None:
+                        data += select_res
+            filtered_videos = [int(temp[0]) for temp in data]
+            return filtered_videos
+
+        except Exception as e:
+            #log_.error(f"过滤失败, exception: {e}, traceback: {traceback.format_exc()}")
+            send_msg_to_feishu(
+                webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+                key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+                msg_text=f"rov-offline{config_.ENV_TEXT} - 过滤失败\n"
+                         f"retry count: {i}\n"
+                         f"exception: {e}\n"
+                         f"traceback: {traceback.format_exc()}"
+            )
+            i += 1
+            if i == 1:
+                return video_ids
+
+
+def filter_video_status_with_applet_rec(video_ids, applet_rec_status):
+    """
+    对视频状态进行过滤
+    :param video_ids: 视频id列表 type-list
+    :param applet_rec_status: 小程序推荐状态 -6:待推荐 1:普通推荐
+    :return: filtered_videos
+    """
+    i = 0
+    while i < 3:
+        try:
+            mysql_helper = MysqlHelper(mysql_info=config_.FILTER_MYSQL_INFO)
+            video_status_sql = "SELECT t1.id AS 'video_id', " \
+                               "t1.transcode_status AS 'transcoding_status', " \
+                               "t2.audit_status AS 'audit_status', " \
+                               "t2.video_status AS 'open_status', " \
+                               "t2.recommend_status AS 'applet_rec_status', " \
+                               "t2.app_recommend_status AS 'app_rec_status', " \
+                               "t3.charge AS 'payment_status', " \
+                               "case when t4.max_validate_count is null then 0 else t4.max_validate_count end AS 'encryption_status' " \
+                               "FROM longvideo.wx_video t1 " \
+                               "LEFT JOIN longvideo.wx_video_status t2 ON t1.id= t2.video_id " \
+                               "LEFT JOIN longvideo.wx_video_detail t3 ON t1.id= t3.video_id " \
+                               "LEFT JOIN longvideo.wx_video_pwd t4 ON t1.id= t4.video_id"
+            if len(video_ids) == 1:
+                sql = "SELECT video_id " \
+                      "FROM ({}) " \
+                      "WHERE audit_status = 5 " \
+                      "AND applet_rec_status = {} " \
+                      "AND open_status = 1 " \
+                      "AND payment_status = 0 " \
+                      "AND encryption_status != 5 " \
+                      "AND transcoding_status = 3 " \
+                      "AND video_id IN ({});".format(video_status_sql, applet_rec_status, video_ids[0])
+                data = mysql_helper.get_data(sql=sql)
+
+            else:
+                data = []
+                for i in range(len(video_ids) // 200 + 1):
+                    sql = "SELECT video_id " \
+                          "FROM ({}) " \
+                          "WHERE audit_status = 5 " \
+                          "AND applet_rec_status = {} " \
+                          "AND open_status = 1 " \
+                          "AND payment_status = 0 " \
+                          "AND encryption_status != 5 " \
+                          "AND transcoding_status = 3 " \
+                          "AND video_id IN {};".format(video_status_sql, applet_rec_status,
+                                                       tuple(video_ids[i*200:(i+1)*200]))
+                    select_res = mysql_helper.get_data(sql=sql)
+                    if select_res is not None:
+                        data += select_res
+            filtered_videos = [int(temp[0]) for temp in data]
+            return filtered_videos
+
+        except Exception as e:
+            #log_.error(f"过滤失败, exception: {e}, traceback: {traceback.format_exc()}")
+            send_msg_to_feishu(
+                webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+                key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+                msg_text=f"rov-offline{config_.ENV_TEXT} - 过滤失败\n"
+                         f"retry count: {i}\n"
+                         f"exception: {e}\n"
+                         f"traceback: {traceback.format_exc()}"
+            )
+            i += 1
+            if i == 1:
+                return video_ids
+
+
+def filter_video_status_app(video_ids):
+    """
+    对视频状态进行过滤 - app
+    :param video_ids: 视频id列表 type-list
+    :return: filtered_videos
+    """
+    i = 0
+    while i < 3:
+        try:
+            mysql_helper = MysqlHelper(mysql_info=config_.FILTER_MYSQL_INFO)
+            video_status_sql = "SELECT t1.id AS 'video_id', " \
+                               "t1.transcode_status AS 'transcoding_status', " \
+                               "t2.app_audit_status AS 'app_audit_status', " \
+                               "t2.original_status AS 'open_status', " \
+                               "t2.recommend_status AS 'applet_rec_status', " \
+                               "t2.app_recommend_status AS 'app_rec_status', " \
+                               "t3.charge AS 'payment_status', " \
+                               "case when t4.max_validate_count is null then 0 else t4.max_validate_count end AS 'encryption_status' " \
+                               "FROM longvideo.wx_video t1 " \
+                               "LEFT JOIN longvideo.wx_video_status t2 ON t1.id= t2.video_id " \
+                               "LEFT JOIN longvideo.wx_video_detail t3 ON t1.id= t3.video_id " \
+                               "LEFT JOIN longvideo.wx_video_pwd t4 ON t1.id= t4.video_id"
+
+            if len(video_ids) == 1:
+                sql = "SELECT video_id " \
+                      "FROM ({}) " \
+                      "WHERE app_audit_status = 5 " \
+                      "AND app_rec_status IN (1, -6, 10) " \
+                      "AND open_status = 1 " \
+                      "AND payment_status = 0 " \
+                      "AND encryption_status != 5 " \
+                      "AND transcoding_status = 3 " \
+                      "AND video_id IN ({});".format(video_status_sql, video_ids[0])
+                data = mysql_helper.get_data(sql=sql)
+
+            else:
+                data = []
+                for i in range(len(video_ids) // 200 + 1):
+                    sql = "SELECT video_id " \
+                          "FROM ({}) " \
+                          "WHERE app_audit_status = 5 " \
+                          "AND app_rec_status IN (1, -6, 10) " \
+                          "AND open_status = 1 " \
+                          "AND payment_status = 0 " \
+                          "AND encryption_status != 5 " \
+                          "AND transcoding_status = 3 " \
+                          "AND video_id IN {};".format(video_status_sql, tuple(video_ids[i*200:(i+1)*200]))
+                    select_res = mysql_helper.get_data(sql=sql)
+                    if select_res is not None:
+                        data += select_res
+
+            filtered_videos = [int(temp[0]) for temp in data]
+            return filtered_videos
+
+        except Exception as e:
+            #log_.error(f"过滤失败, exception: {e}, traceback: {traceback.format_exc()}")
+            send_msg_to_feishu(
+                webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+                key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+                msg_text=f"rov-offline{config_.ENV_TEXT} - 过滤失败\n"
+                         f"retry count: {i}\n"
+                         f"exception: {e}\n"
+                         f"traceback: {traceback.format_exc()}"
+            )
+            i += 1
+            if i == 1:
+                return video_ids
+
+
+def filter_shield_video(video_ids, shield_key_name_list):
+    """
+    过滤屏蔽视频视频
+    :param video_ids: 需过滤的视频列表 type-list
+    :param shield_key_name_list: 过滤视频 redis-key
+    :return: filtered_videos  过滤后的列表  type-list
+    """
+    if len(video_ids) == 0:
+        return video_ids
+    # 根据Redis缓存中的数据过滤
+    redis_helper = RedisHelper()
+    for shield_key_name in shield_key_name_list:
+        shield_videos_list = redis_helper.get_data_from_set(key_name=shield_key_name)
+        if not shield_videos_list:
+            continue
+        shield_videos = [int(video) for video in shield_videos_list]
+        video_ids = [int(video_id) for video_id in video_ids if int(video_id) not in shield_videos]
+
+    return video_ids
+
+
+def filter_political_videos(video_ids):
+    """
+    过滤涉政视频
+    :param video_ids: 需过滤的视频列表 type-list
+    :return: filtered_video_ids  过滤后的列表  type-list
+    """
+    if len(video_ids) == 0:
+        return video_ids
+    # 根据Redis缓存中的数据过滤
+    redis_helper = RedisHelper()
+    political_key_name = config_.POLITICAL_VIDEOS_KEY_NAME
+    political_videos_list = redis_helper.get_data_from_set(key_name=political_key_name)
+    if not political_videos_list:
+        return video_ids
+    political_videos = [int(video) for video in political_videos_list]
+    filtered_video_ids = [int(video_id) for video_id in video_ids if int(video_id) not in political_videos]
+
+    return filtered_video_ids
+
+
+def update_video_w_h_rate(video_ids, key_name):
+    """
+    获取横屏视频的宽高比,并存入redis中 (width/height>1)
+    :param video_ids: videoId列表 type-list
+    :param key_name: redis key
+    :return: None
+    """
+    # 获取数据
+    if len(video_ids) == 1:
+        sql = "SELECT id, width, height, rotate FROM longvideo.wx_video WHERE id = {};".format(video_ids[0])
+    else:
+        sql = "SELECT id, width, height, rotate FROM longvideo.wx_video WHERE id IN {};".format(tuple(video_ids))
+
+    mysql_helper = MysqlHelper(mysql_info=config_.MYSQL_INFO)
+    data = mysql_helper.get_data(sql=sql)
+
+    # 更新到redis
+    info_data = {}
+    for video_id, width, height, rotate in data:
+        if int(width) == 0 or int(height) == 0:
+            continue
+        # rotate 字段值为 90或270时,width和height的值相反
+        if int(rotate) in (90, 270):
+            w_h_rate = int(height) / int(width)
+        else:
+            w_h_rate = int(width) / int(height)
+        if w_h_rate > 1:
+            info_data[int(video_id)] = w_h_rate
+    redis_helper = RedisHelper()
+    # 删除旧数据
+    redis_helper.del_keys(key_name=key_name)
+    # 写入新数据
+    if len(info_data) > 0:
+        redis_helper.add_data_with_zset(key_name=key_name, data=info_data)
+
+
+def data_check(project, table, dt):
+    """检查数据是否准备好"""
+    odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+
+    try:
+        check_res = check_table_partition_exits(date=dt, project=project, table=table)
+        if check_res:
+            sql = f'select * from {project}.{table} where dt = {dt}'
+            with odps.execute_sql(sql=sql).open_reader() as reader:
+                data_count = reader.count
+        else:
+            data_count = 0
+    except Exception as e:
+        data_count = 0
+    return data_count
+
+
+def get_feature_data(project, table, features, dt):
+    """获取特征数据"""
+    records = get_data_from_odps(date=dt, project=project, table=table)
+    feature_data = []
+    for record in records:
+        item = {}
+        for feature_name in features:
+            item[feature_name] = record[feature_name]
+        feature_data.append(item)
+    feature_df = pd.DataFrame(feature_data)
+    return feature_df
+
+
+if __name__ == '__main__':
+    # data_test = [9.20273281e+03, 7.00795065e+03, 5.54813112e+03, 9.97402494e-01, 9.96402495e-01, 9.96402494e-01]
+    # data_normalization(data_test)
+    # request_post(request_url=config_.NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL, request_data={'videos': []})
+    # video_ids = [110, 112, 113, 115, 116, 117, 8289883]
+    # update_video_w_h_rate(video_ids=video_ids, key_name='')
+    project = config_.PROJECT_24H_APP_TYPE
+    table = config_.TABLE_24H_APP_TYPE
+    dt = '2022080115'
+    check_res = check_table_partition_exits(date=dt, project=project, table=table)
+    print(check_res)

+ 28 - 0
word2vec.py

@@ -0,0 +1,28 @@
+#coding utf-8
+import sys
+from gensim.models import word2vec
+
+
+if __name__=="__main__":
+    f = open(sys.argv[1])
+    arr = []
+    num = 0
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        num = num+1
+        if  num == 1:
+            continue
+        items = line.strip().split("\t")
+        #print(items)
+        if len(items)<2:
+            continue
+        arr.append(items[1].split(" "))
+        #print(arr)
+    f.close()
+    model = word2vec.Word2Vec(arr, vector_size=64, min_count=2,sg=1, workers=10)
+    model.wv.save_word2vec_format('word2vec.txt',binary=False)
+    #model.save('word2vec.model')
+
+