sina-scrap-for-gep/cleandata.py at master · rogerclarkgc/sina-scrap-for-gep · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# coding:utf-8
import string
import re
import textwrap
import pickle

from tqdm import tqdm
import jieba


from pymongo import MongoClient


def cleancomment(comment=None, punctuation=True):
    """
    clean the comment, keeping only chinese characters (and, optionally,
    a small set of punctuation marks)

    The boilerplate phrases '网页链接', '秒拍视频' and '展开全文' are removed
    first; after that every character matching the pattern is kept in its
    original order and everything else is dropped.

    :param comment:the comment string; None or an empty string gives ''
    :param punctuation:boolean value, if True, the marks 。，！？：、 are kept.
        The commas separating them inside the character class are literal,
        so the ASCII ',' is kept as well.
    :return:string object, the comment after cleaning ('' if nothing is left)
    """
    # Missing comments (the default None, or an empty field) have nothing to
    # clean; without this guard the substring handling below raises TypeError.
    if not comment:
        return ''
    # basic chinese characters and punctuations
    pattern = '[\u4e00-\u9fa5]'
    if punctuation:
        pattern = '[\u4e00-\u9fa5,。,，,！,？,：,、]'
    # str.replace is a no-op when the phrase is absent, so no membership
    # test is needed before calling it.
    for phrase in ('网页链接', '秒拍视频', '展开全文'):
        comment = comment.replace(phrase, '')
    # Joining an empty match list already yields ''.
    return "".join(re.findall(pattern, comment))
def remove_stop(cutlist=None, stopword=None):
    """
    drop stop words from the tokens of one comment
    :param cutlist:the cut list of an comment
    :param stopword:the collection of stop words; when it is None or empty,
        the default one is loaded from 'stop_list.pickle'
    :return:a new list with the tokens of cutlist that are not stop words,
        in their original order
    """
    # A falsy stopword (None, empty list, ...) falls back to the pickled list.
    stopdic = stopword if stopword else loadpkl('stop_list.pickle')
    kept = []
    for token in cutlist:
        if token in stopdic:
            continue
        kept.append(token)
    return kept


def mergecomment(cursor=None, merge=False, split='\n', punc=False):
    """
    clean the 'comment' field of every document and optionally merge the
    cleaned comments in one string object
    :param cursor: pymongo cursor object (any iterable of documents that
        have a 'comment' key works)
    :param merge: merge all doc in one string object
    :param split: the split character of every comment, only used when
        merge is True
    :param punc: if true, will not exclude punctuations in comments
    :return: the string object (merge=True) or the list of cleaned
        comments (merge=False)
    """
    # Cursor.count() no longer exists in PyMongo 4, so the raw comments are
    # pulled out first and counted directly. This reports the number of
    # documents actually processed and does not need a pymongo-specific API.
    comments = [doc['comment'] for doc in cursor]
    print('当前cursor中的文档数目为：{}'.format(len(comments)))
    print('开始执行文档合并操作，分隔符号为：{}'.format(split))
    result = [cleancomment(text, punctuation=punc) for text in comments]
    print('合并完成！')
    if merge:
        return split.join(result)
    return result

def wordjoin(cutlist=None, sep=" "):
    """
    glue the words of every sentence, and then the sentences themselves,
    together with sep
    :param cutlist: the list of cut result for every sentence
    :param sep: the character placed between words and between sentences
    :return:the str object
    """
    sentences = []
    for words in cutlist:
        # An empty sentence contributes an empty string, so it still
        # produces a separator in the final result.
        sentences.append(sep.join(words))
    return sep.join(sentences)

def writepkl(doc=None, name='data.pickle'):
    """
    serialize an object with pickle (highest protocol) into a file
    :param doc:object to store
    :param name:the name (path) of the file, overwritten if it exists
    :return: None
    """
    with open(name, 'wb') as handle:
        pickle.dump(doc, handle, protocol=pickle.HIGHEST_PROTOCOL)

def loadpkl(name='data.pickle'):
    """
    read back the object stored in a .pickle file
    :param name:the filename (path) of .pickle
    :return:the object in .pickle
    """
    with open(name, 'rb') as handle:
        return pickle.load(handle)

def writetxt(name='corpus.txt', doc=None):
    """
    write a string into a utf-8 text file, replacing any previous content
    :param name:the name (path) of the text file
    :param doc:the str object to write
    :return:1 when the text has been written
    :raises RuntimeError: if doc is not a str object
    """
    if not isinstance(doc, str):
        raise RuntimeError('doc need be a str object')
    # The file has to be opened for writing: with the read-only mode 'r'
    # the call failed for every input (missing file, or write() refused).
    with open(name, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(doc)
    return 1

def dataloader(keyword, timegap=None):
    """
    query the raw comments stored in the sina.weibo collection of mongodb
    :param keyword: the keyword of comment
    :param timegap: time gap of comment, ('2017-01-01', '2017-01-31');
        when given, 'timestamp' must lie between its two bounds (inclusive)
    :return: the cursor of database
    """
    client = MongoClient()
    collection = client.sina.weibo
    query = {'keyword': keyword}
    if timegap:
        # Restrict to timegap[0] <= timestamp <= timegap[1].
        query['timestamp'] = {'$gte': timegap[0],
                              '$lte': timegap[1]}
    return collection.find(query)

if __name__ == '__main__':
    # Pipeline: fetch the comments stored for the keyword '大熊猫', clean
    # them, tokenize with jieba, drop stop words and pickle each stage.

    # dataloader runs the same {'keyword': ...} query on sina.weibo that
    # used to be spelled out here.
    find = dataloader('大熊猫')
    print('starting to merge...')
    res = mergecomment(cursor=find, merge=False, split="", punc=False)

    print('start to token...')
    cut_list = [list(jieba.cut(doc, cut_all=False)) for doc in res]

    print('starting to wipe out stop words...')
    # Load the stop-word list once and hand it to remove_stop; calling
    # remove_stop without it re-reads 'stop_list.pickle' for every comment.
    stop_words = loadpkl('stop_list.pickle')
    cut_list_nonstop = [remove_stop(cut, stopword=stop_words)
                        for cut in cut_list]

    print('write result list')
    writepkl(res, name='panda_raw_list.pickle')
    writepkl(cut_list, name='panda_cut_list.pickle')
    writepkl(cut_list_nonstop, name='panda_cut_nonstop.pickle')