TF-IDF演算法抽取中文內容的主題關鍵詞

TF-IDF演算法抽取中文內容的主題關鍵詞

db.ini

# db
[db]
db_port = 3306
db_user = user
db_host = localhost
db_pass = pwd
db_database = db

main.py

# -*- coding: utf-8 -*-
"""Extract topic keywords from Chinese text stored in MySQL via TF-IDF.

Pipeline: read up to 1000 rows from MySQL, tokenize each document with
jieba while filtering stop words, fit a CountVectorizer + TfidfTransformer
over the corpus, then print the top-N highest-weighted terms per document.

NOTE(review): the original mixed Python 2 (`print` statements, `unicode`,
`reload(sys)`/`setdefaultencoding`) with the Python-3-only `configparser`
module; this version targets Python 3 only.
"""

import configparser
import os

import MySQLdb
import jieba.posseg as pseg
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def PATH(p):
    """Resolve *p* relative to this file's directory."""
    return os.path.abspath(os.path.join(os.path.dirname(__file__), p))


db_file = PATH('db.ini')


class IfTdf(object):
    """TF-IDF keyword extractor backed by a MySQL table."""

    def __init__(self):
        self.conn = None
        self.cur = None
        self.init_db()

    def init_db(self):
        """Open the MySQL connection described by the [db] section of db.ini."""
        dbc = configparser.ConfigParser()
        dbc.read(db_file)
        self.conn = MySQLdb.connect(
            host=dbc.get("db", 'db_host'),
            user=dbc.get("db", 'db_user'),
            passwd=dbc.get("db", 'db_pass'),
            db=dbc.get("db", 'db_database'),
            port=int(dbc.get("db", 'db_port')),
            charset='utf8')
        self.cur = self.conn.cursor(MySQLdb.cursors.DictCursor)

    def get_data(self):
        """Fetch the newest 1000 rows (id, content) from the source table."""
        self.cur.execute("SELECT id, content FROM `table` WHERE 1 ORDER BY `id` DESC LIMIT 1000")
        return self.cur.fetchall()

    def get_words(self, data):
        """Yield one space-joined token string per row, stop words removed.

        The space-joined form is what CountVectorizer expects as a
        pre-tokenized document.
        """
        # Set membership is O(1) per token; the original used a list.
        # `with` guarantees the stop-word file is closed (original leaked it).
        with open(PATH('chinese_stopwords.txt'), encoding='utf-8') as f:
            stop_words = {line.rstrip() for line in f}
        for r in data:
            content = (r['content'].strip()
                       .replace('\n', '').replace(' ', '')
                       .replace('\t', '').replace('\r', ''))
            yield ' '.join(seg.word for seg in pseg.cut(content)
                           if seg.word not in stop_words)

    def get_ids(self, data):
        """Yield a printable header line per row: '<id> <content> Topic:'."""
        for r in data:
            yield '%s %s Topic:\n' % (r['id'], r['content'])

    def __del__(self):
        # Guard: if init_db() raised, cur/conn may still be None — the
        # original crashed with AttributeError here in that case.
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()
        print('Finished!')

    def main(self):
        """Run the full extraction and print the top-n keywords per row."""
        data = self.get_data()
        list_words = list(self.get_words(data))
        list_ids = list(self.get_ids(data))
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        # Term-frequency matrix -> TF-IDF weights, one row per document.
        tfidf = transformer.fit_transform(vectorizer.fit_transform(list_words))
        words = vectorizer.get_feature_names()
        weight = tfidf.toarray()
        n = 3  # number of keywords to report per document
        for (row_id, w) in zip(list_ids, weight):  # renamed: don't shadow builtin `id`
            print(u'{}:'.format(row_id))
            loc = np.argsort(-w)  # indices of terms, highest weight first
            for i in range(n):
                print(u'-{}: {} {}'.format(str(i + 1), words[loc[i]], w[loc[i]]))
            print('\n')


if __name__ == '__main__':
    # Guarded so importing this module no longer hits the database.
    IfTdf().main()

參考: