對語料庫的每一個句子的每一個單詞加權重
阿新 • 發佈:2019-02-13
包括預處理,使用tfidf加權重
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/05/15 上午10:48
"""Assign a tf-idf weight to every word of every sentence in a corpus.

Pipeline: cut each sentence with jieba, drop stopwords and tokens that are
not ASCII-alphanumeric/underscore/CJK, then weight each surviving word with
an L1-normalised tf-idf score from the whole corpus.
"""
import itertools
import re

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

STOPWORDS_PATH = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/stopwords'
CORPUS_PATH = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/why'

# Tokens made only of digits, ASCII letters, underscore, or CJK ideographs
# (U+4E00..U+9FA5).  NOTE: the original used a `ur''` literal, which is a
# SyntaxError on Python 3; a plain raw string is equivalent here.
chinese = re.compile(r'^[0-9a-zA-Z_\u4e00-\u9fa5]+$')


def load_stopwords(path=STOPWORDS_PATH):
    """Return the stopword file's lines as a frozenset of unicode strings.

    `path` is parameterised (default: the original hard-coded location) so
    the loader can be reused against another stopword list.
    """
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(path, 'rb') as fh:
        content = fh.read().decode('utf-8')
    return frozenset(content.splitlines())


stopwords = load_stopwords()


def filter_func(word):
    """Keep *word* iff it matches the alphanumeric/CJK pattern and is not a stopword."""
    return bool(chinese.match(word)) and word not in stopwords


def my_tokenizer(sentence):
    """Cut *sentence* with jieba and return the filtered tokens as a list.

    Returning a list (not a lazy `filter` object) keeps the result safely
    re-iterable on Python 3.
    """
    return [word for word in jieba.lcut(sentence) if filter_func(word)]


def word_and_weight(corpus):
    """Yield, per sentence of *corpus*, a list of (word, tfidf_weight) pairs.

    Words absent from the fitted vocabulary get weight 0.0.  (This happens
    e.g. for case-mismatched tokens: TfidfVectorizer lowercases documents
    before tokenizing, while we tokenize the raw sentence here — the original
    code crashed with TypeError on the resulting `None` column index.)
    """
    vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, norm='l1')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    vocabulary = vectorizer.vocabulary_
    for row_idx in range(len(corpus)):
        pairs = []
        for word in my_tokenizer(corpus[row_idx]):
            col = vocabulary.get(word)
            # OOV -> 0.0 instead of indexing the sparse matrix with None.
            weight = tfidf_matrix[row_idx, col] if col is not None else 0.0
            pairs.append((word, weight))
        yield pairs


def load_corpus(path=CORPUS_PATH):
    """Return the second tab-separated field of every line that has one.

    Lines without a '\\t'-separated second field are silently skipped,
    matching the original best-effort behaviour.
    """
    with open(path, 'rb') as fh:
        content = fh.read().decode('utf-8')
    lines = []
    for line in content.splitlines():
        fields = line.split('\t')
        if len(fields) > 1:
            lines.append(fields[1])
    return lines


def main(limit=1250):
    """Print word/weight pairs for the first *limit* corpus sentences."""
    for pairs in itertools.islice(word_and_weight(load_corpus()), limit):
        for word, weight in pairs:
            print(word, weight)
        print()


# Guard the driver so importing this module no longer runs the whole job.
if __name__ == '__main__':
    main()