Browse Source

update title

linfan 1 year ago
parent
commit
53f47c68c9
6 changed files with 199 additions and 0 deletions
  1. 47 0
      cut_title.py
  2. 54 0
      extract_user_action.py
  3. 53 0
      extract_video_info.py
  4. 32 0
      run_extract_tag.sh
  5. 1 0
      stopwords
  6. 12 0
      word2vec.py

+ 47 - 0
cut_title.py

@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+import sys
+import jieba 
+import os
+
+if __name__=="__main__":
+    # Usage: python cut_title.py <stopword_dir> <input_tsv> <output_tsv>
+    #
+    # Reads tab-separated "vid<TAB>title" lines, segments each title with
+    # jieba, drops single-space tokens and stop words, and writes
+    # "vid<TAB>tokens joined by spaces" to the output file.
+    if len(sys.argv) < 4:
+        # Fail with a readable message instead of an IndexError traceback.
+        sys.exit("usage: %s <stopword_dir> <input_tsv> <output_tsv>" % sys.argv[0])
+
+    stop_words = set()
+    path = sys.argv[1]
+    # Every directory entry whose name contains '.txt' is treated as a
+    # stop-word list with one word per line (surrounding whitespace removed).
+    for file_name in os.listdir(path):
+        if '.txt' not in file_name:
+            continue
+        # `with` releases the handle even if reading raises.
+        with open(os.path.join(path, file_name)) as stop_file:
+            for file_line in stop_file:
+                stop_words.add(file_line.strip())
+
+    # Both handles are closed on exit, including on error.
+    with open(sys.argv[2]) as f, open(sys.argv[3], 'w') as f3:
+        for line in f:
+            items = line.strip().split("\t")
+            # Lines without both a vid and a title column are skipped.
+            if len(items)<2:
+                continue
+            vid = items[0]
+            title = items[1]
+            cut_arr = [
+                cut_item for cut_item in jieba.lcut(title)
+                if cut_item != ' ' and cut_item not in stop_words
+            ]
+            vid_info = vid+'\t'+" ".join(cut_arr)
+            # strip() removes the trailing tab left when every token was filtered out.
+            f3.write(vid_info.strip()+"\n")
+       

+ 54 - 0
extract_user_action.py

@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+# Settings object returned by set_config() from the local config module;
+# created once when this module is loaded.
+config_ = set_config()
+
+# Module-level ODPS client used by get_data_from_odps() and exe_sql() below.
+# Credentials and endpoint are read from config_.ODPS_CONFIG; the project is
+# fixed to "loghubods".
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Read one ``dt`` partition of a table through the module-level ODPS client.
+
+    :param date: partition date, type-string '%Y%m%d'
+    :param project: type-string; accepted but not used by this function
+    :param table: table name, type-string
+    :param connect_timeout: connection timeout setting; not used by this function
+    :param read_timeout: read timeout setting; not used by this function
+    :param pool_maxsize: not used by this function
+    :param pool_connections: not used by this function
+    :return: the result of ``odps.read_table`` for partition ``dt=<date>``
+    """
+    partition_spec = 'dt=%s' % date
+    return odps.read_table(name=table, partition=partition_spec)
+
+def exe_sql(sql):
+    """
+    Execute a SQL statement on the module-level ODPS client and gather the
+    result into a pandas DataFrame.
+
+    :param sql: SQL statement, type-string
+    :return: DataFrame with one column per key found in the records
+    """
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        columns = defaultdict(list)  # key -> list of that key's values, in read order
+        for record in reader:
+            for field in record:
+                # Each item of a record is read as a (key, value) pair.
+                column_name, value = field[0], field[1]
+                columns[column_name].append(value)
+        # orient='index' yields one row per key; transpose so each key becomes a column.
+        by_key = pd.DataFrame.from_dict(columns, orient='index')
+        data = by_key.T
+    return data
+
+
+if __name__=="__main__":
+    # Usage: python extract_user_action.py <last7day> <now_date>
+    #
+    # Exports the 'videoShareFriend' rows of loghubods.user_action_log_base
+    # whose dt lies between the two given dates (SQL BETWEEN) to a
+    # tab-separated file named ./data/user_action_<now_date>.
+    if len(sys.argv) < 3:
+        # Fail with a readable message instead of an IndexError traceback.
+        sys.exit("usage: %s <last7day> <now_date>" % sys.argv[0])
+    last7day=sys.argv[1]
+    now_date=sys.argv[2]
+    print("now date:", now_date)
+    # Both dates are spliced into the statement verbatim, so they must come
+    # from a trusted caller.
+    sql = "select  mid, videoid, businesstype, clienttimestamp, return from loghubods.user_action_log_base where dt between '"+last7day+"' and '"+now_date+"' and businesstype in ('videoShareFriend');"
+    print(sql)
+    data = exe_sql(sql)
+    # NOTE(review): no index= argument is passed, so the DataFrame index is
+    # written as the first column; extract_video_info.py passes index=None.
+    # Confirm which layout the downstream reader expects.
+    # The ./data/ directory must already exist (run_extract_tag.sh creates it).
+    data.to_csv("./data/user_action_"+now_date, sep='\t')

+ 53 - 0
extract_video_info.py

@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+# Settings object returned by set_config() from the local config module;
+# created once when this module is loaded.
+config_ = set_config()
+
+# Module-level ODPS client used by get_data_from_odps() and exe_sql() below.
+# Credentials and endpoint are read from config_.ODPS_CONFIG; the project is
+# fixed to "loghubods".
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Read one ``dt`` partition of a table through the module-level ODPS client.
+
+    :param date: partition date, type-string '%Y%m%d'
+    :param project: type-string; accepted but not used by this function
+    :param table: table name, type-string
+    :param connect_timeout: connection timeout setting; not used by this function
+    :param read_timeout: read timeout setting; not used by this function
+    :param pool_maxsize: not used by this function
+    :param pool_connections: not used by this function
+    :return: the result of ``odps.read_table`` for partition ``dt=<date>``
+    """
+    partition_spec = 'dt=%s' % date
+    return odps.read_table(name=table, partition=partition_spec)
+
+def exe_sql(sql):
+    """
+    Execute a SQL statement on the module-level ODPS client and gather the
+    result into a pandas DataFrame.
+
+    :param sql: SQL statement, type-string
+    :return: DataFrame with one column per key found in the records
+    """
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        columns = defaultdict(list)  # key -> list of that key's values, in read order
+        for record in reader:
+            for field in record:
+                # Each item of a record is read as a (key, value) pair.
+                column_name, value = field[0], field[1]
+                columns[column_name].append(value)
+        # orient='index' yields one row per key; transpose so each key becomes a column.
+        by_key = pd.DataFrame.from_dict(columns, orient='index')
+        data = by_key.T
+    return data
+
+
+if __name__=="__main__":
+    # Usage: python extract_video_info.py <now_date>
+    #
+    # Exports rows of videoods.wx_video_per1h with status=1 and
+    # examine_status=1 to a tab-separated file named
+    # ./data/video_data_info_<now_date>. The date is used only in the output
+    # file name; the query itself has no date filter.
+    if len(sys.argv) < 2:
+        # Fail with a readable message instead of an IndexError traceback.
+        sys.exit("usage: %s <now_date>" % sys.argv[0])
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    sql = "select id, title, video_path, cover_img_path,self_cover_img_path,play_count, share_count, reported_count, favoriteds, total_time, tag_count,stage_recommend_examine_status, sensitive_status, new_share_image_path from videoods.wx_video_per1h where status=1 and examine_status=1 "
+    print(sql)
+    data = exe_sql(sql)
+    # index=None keeps the DataFrame index out of the file.
+    # The ./data/ directory must already exist (run_extract_tag.sh creates it).
+    data.to_csv("./data/video_data_info_"+now_date, sep='\t', index=None)

+ 32 - 0
run_extract_tag.sh

@@ -0,0 +1,32 @@
+#!/bin/bash
+# Driver script: sets up the shell/conda environment, computes the date
+# strings, makes sure ./data/ exists, and runs extract_video_info.py.
+# The user-action export and all error reporting are currently commented out.
+source ~/.bash_profile
+source ~/.bashrc
+
+# The Python scripts run inside the "python36" conda environment.
+conda activate python36 
+
+# 1. Compute the date strings (YYYYMMDD).
+# nowday is today. NOTE(review): despite its name, last7day is only ONE day
+# back (-1days), and it is used only by the commented-out call below --
+# confirm the intended window before re-enabling that call.
+nowday=`date  +"%Y%m%d" -d -0days`
+last7day=`date  +"%Y%m%d" -d -1days`
+echo ${nowday} 
+# 2. Make sure the output directory used by the Python scripts exists.
+mkdir -p ./data/
+
+# 3. (disabled) Export user actions.
+# NOTE(review): if re-enabled, `msg = "..."` is not a valid shell assignment
+# (no spaces are allowed around `=`), and $msg should be quoted when passed on.
+#python extract_user_action.py  ${last7day} ${nowday}
+#if [ $? -ne 0 ];
+#then
+#    msg = "[ERROR] sorted extract_vid_log"
+#    sh sendmsg.sh  $nowday  $msg
+#    echo "[ERROR] echo 'extract_vid.py"
+#    exit 255
+#fi
+
+# 4. Export video info for today. The exit status is not checked while the
+# block below stays commented out.
+python extract_video_info.py ${nowday}
+#if [ $? -ne 0 ];
+#then
+#    msg = "[ERROR] cal ctr "
+#    sh sendmsg.sh  $nowday  $msg
+#    echo "[ERROR] echo 'calCtr.py"
+#    exit 255
+#fi
+#echo "finish sorted"
+

+ 1 - 0
stopwords

@@ -0,0 +1 @@
+Subproject commit fbc0150f746e2757324ef67b15e3b347079a7e9d

+ 12 - 0
word2vec.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+import sys
+from gensim.models import word2vec
+
+
+if __name__=="__main__":
+    # Usage: python word2vec.py <corpus_file>
+    #
+    # Placeholder: the corpus file is read line by line, but the lines are
+    # not used yet.
+    if len(sys.argv) < 2:
+        # Fail with a readable message instead of an IndexError traceback.
+        sys.exit("usage: %s <corpus_file>" % sys.argv[0])
+    # `with` closes the file on exit; iterating the handle replaces the
+    # manual readline loop, whose `readlin` typo raised AttributeError.
+    with open(sys.argv[1]) as f:
+        for line in f:
+            pass
+