2 years ago · 53f47c68c9
--- a/cut_title.py
+++ b/cut_title.py
@@ -0,0 +1,47 @@
 
				+# coding: utf-8
			
 
				+import sys
			
 
				+import jieba 
			
 
				+import os
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    #f1 = open(sys.argv[1])
			
 
				+    stop_words = set('')
			
 
				+    path = sys.argv[1]
			
 
				+    files_dir = os.listdir(path)
			
 
				+    #print(files_dir)
			
 
				+    for file_name in files_dir:
			
 
				+        if file_name.find('.txt')>-1:
			
 
				+            f1 = open(path+"/"+file_name)
			
 
				+            while True:
			
 
				+                file_line = f1.readline()
			
 
				+                if not file_line:
			
 
				+                    break
			
 
				+                file_line = file_line.strip()
			
 
				+                stop_words.add(file_line)
			
 
				+            f1.close()
			
 
				+    #print(len(stop_words))
			
 
				+    f = open(sys.argv[2])
			
 
				+    f3 = open(sys.argv[3], 'w')
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+            break
			
 
				+        line = line.strip()
			
 
				+        items = line.split("\t")
			
 
				+        if len(items)<2:
			
 
				+            continue
			
 
				+        vid = items[0]
			
 
				+        title = items[1] 
			
 
				+        cut_info =  jieba.lcut(title)
			
 
				+        cut_arr = []
			
 
				+        for cut_item in cut_info:
			
 
				+            #print("cut_item:", cut_item)
			
 
				+            if cut_item==' ':
			
 
				+                continue
			
 
				+            if cut_item in stop_words:
			
 
				+                continue
			
 
				+            cut_arr.append(cut_item)
			
 
				+        vid_info = vid+'\t'+" ".join(cut_arr)
			
 
				+        f3.write(vid_info.strip()+"\n")
			
 
				+    f3.close()
			
 
				+       
			
--- a/extract_user_action.py
+++ b/extract_user_action.py
@@ -0,0 +1,54 @@
 
				+# coding: utf-8
			
 
				+from odps import ODPS
			
 
				+from config import set_config
			
 
				+import datetime
			
 
				+import pandas as pd
			
 
				+from collections import defaultdict
			
 
				+import sys
			
 
				+
			
 
				+config_ = set_config()
			
 
				+
			
 
				+odps = ODPS(
			
 
				+        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				+        project="loghubods",
			
 
				+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
			
 
				+
			
 
				+
			
 
				+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
			
 
				+                       pool_maxsize=1000, pool_connections=1000):
			
 
				+    """
			
 
				+    从odps获取数据
			
 
				+    :param date: 日期 type-string '%Y%m%d'
			
 
				+    :param project: type-string
			
 
				+    :param table: 表名 type-string
			
 
				+    :param connect_timeout: 连接超时设置
			
 
				+    :param read_timeout: 读取超时设置
			
 
				+    :param pool_maxsize:
			
 
				+    :param pool_connections:
			
 
				+    :return: records
			
 
				+    """
			
 
				+    records = odps.read_table(name=table, partition='dt=%s' % date)
			
 
				+    return records
			
 
				+
			
 
				+def exe_sql(sql):    
			
 
				+    data = []
			
 
				+    with odps.execute_sql(sql).open_reader() as reader:
			
 
				+        d = defaultdict(list)  # collection默认一个dict
			
 
				+        for record in reader:
			
 
				+            for res in record:
			
 
				+                d[res[0]].append(res[1])  # 解析record中的每一个元组，存储方式为(k,v)，以k作为key，存储每一列的内容；
			
 
				+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框，并转置，不转置的话是横条数据
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    project = 'loghubods'
			
 
				+    last7day=sys.argv[1]
			
 
				+    now_date=sys.argv[2]
			
 
				+    print("now date:", now_date)
			
 
				+    table = 'user_action_log_base'
			
 
				+    sql = "select  mid, videoid, businesstype, clienttimestamp, return from loghubods.user_action_log_base where dt between '"+last7day+"' and '"+now_date+"' and businesstype in ('videoShareFriend');"
			
 
				+    print(sql)
			
 
				+    data = exe_sql(sql)
			
 
				+    data.to_csv("./data/user_action_"+now_date, sep='\t') 
			
--- a/extract_video_info.py
+++ b/extract_video_info.py
@@ -0,0 +1,53 @@
 
				+# coding: utf-8
			
 
				+from odps import ODPS
			
 
				+from config import set_config
			
 
				+import datetime
			
 
				+import pandas as pd
			
 
				+from collections import defaultdict
			
 
				+import sys
			
 
				+
			
 
				+config_ = set_config()
			
 
				+
			
 
				+odps = ODPS(
			
 
				+        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				+        project="loghubods",
			
 
				+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
			
 
				+
			
 
				+
			
 
				+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
			
 
				+                       pool_maxsize=1000, pool_connections=1000):
			
 
				+    """
			
 
				+    从odps获取数据
			
 
				+    :param date: 日期 type-string '%Y%m%d'
			
 
				+    :param project: type-string
			
 
				+    :param table: 表名 type-string
			
 
				+    :param connect_timeout: 连接超时设置
			
 
				+    :param read_timeout: 读取超时设置
			
 
				+    :param pool_maxsize:
			
 
				+    :param pool_connections:
			
 
				+    :return: records
			
 
				+    """
			
 
				+    records = odps.read_table(name=table, partition='dt=%s' % date)
			
 
				+    return records
			
 
				+
			
 
				+def exe_sql(sql):    
			
 
				+    data = []
			
 
				+    with odps.execute_sql(sql).open_reader() as reader:
			
 
				+        d = defaultdict(list)  # collection默认一个dict
			
 
				+        for record in reader:
			
 
				+            for res in record:
			
 
				+                d[res[0]].append(res[1])  # 解析record中的每一个元组，存储方式为(k,v)，以k作为key，存储每一列的内容；
			
 
				+        data = pd.DataFrame.from_dict(d, orient='index').T  # 转换为数据框，并转置，不转置的话是横条数据
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    project = 'loghubods'
			
 
				+    now_date=sys.argv[1]
			
 
				+    print("now date:", now_date)
			
 
				+    table = 'video_data_each_hour_dataset_24h_total_apptype'
			
 
				+    sql = "select id, title, video_path, cover_img_path,self_cover_img_path,play_count, share_count, reported_count, favoriteds, total_time, tag_count,stage_recommend_examine_status, sensitive_status, new_share_image_path from videoods.wx_video_per1h where status=1 and examine_status=1 ";
			
 
				+    print(sql)
			
 
				+    data = exe_sql(sql)
			
 
				+    data.to_csv("./data/video_data_info_"+now_date, sep='\t', index=None) 
			
--- a/run_extract_tag.sh
+++ b/run_extract_tag.sh
@@ -0,0 +1,32 @@
 
				+#!/bin/bash
			
 
				+source ~/.bash_profile
			
 
				+source ~/.bashrc
			
 
				+
			
 
				+conda activate python36 
			
 
				+
			
 
				+#1. download data
			
 
				+nowday=`date  +"%Y%m%d" -d -0days`
			
 
				+last7day=`date  +"%Y%m%d" -d -1days`
			
 
				+echo ${nowday} 
			
 
				+#3.import res
			
 
				+mkdir -p ./data/
			
 
				+
			
 
				+#python extract_user_action.py  ${last7day} ${nowday}
			
 
				+#if [ $? -ne 0 ];
			
 
				+#then
			
 
				+#    msg = "[ERROR] sorted extract_vid_log"
			
 
				+#    sh sendmsg.sh  $nowday  $msg
			
 
				+#    echo "[ERROR] echo 'extract_vid.py"
			
 
				+#    exit 255
			
 
				+#fi
			
 
				+
			
 
				+python extract_video_info.py ${nowday}
			
 
				+#if [ $? -ne 0 ];
			
 
				+#then
			
 
				+#    msg = "[ERROR] cal ctr "
			
 
				+#    sh sendmsg.sh  $nowday  $msg
			
 
				+#    echo "[ERROR] echo 'calCtr.py"
			
 
				+#    exit 255
			
 
				+#fi
			
 
				+#echo "finish sorted"
			
 
				+
			
--- a/stopwords
+++ b/stopwords
@@ -0,0 +1 @@
 
				+Subproject commit fbc0150f746e2757324ef67b15e3b347079a7e9d
			
--- a/word2vec.py
+++ b/word2vec.py
@@ -0,0 +1,12 @@
 
				+# coding: utf-8
			
 
				+import sys
			
 
				+from gensim.models import word2vec
			
 
				+
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    f = open(sys.argv[1])
			
 
				+    while True:
			
 
				+        line = f.readlin()
			
 
				+        if not line:
			
 
				+            break
			
 
				+
		`@@ -0,0 +1 @@`
		`+Subproject commit fbc0150f746e2757324ef67b15e3b347079a7e9d`