TF-IDF演算法抽取中文內容的主題關鍵詞
阿新 • • 發佈:2019-01-09
db.ini
# MySQL connection settings; main.py reads them from the [db] section below
[db]
db_port = 3306
db_user = user
db_host = localhost
db_pass = pwd
db_database = db
main.py
# -*-coding:utf-8-*-
import MySQLdb
import configparser
import os
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import sys
# Python 2-only workaround: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions. reload(sys) is needed first because, per the
# Python 2 docs, sys.setdefaultencoding is removed from the sys namespace at startup.
# NOTE(review): neither `reload` nor `sys.setdefaultencoding` exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
def PATH(p):
    """Return the absolute path of *p* resolved against this file's directory."""
    base_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_dir, p))


# Location of the INI file that holds the MySQL connection settings.
db_file = PATH('db.ini')
class IfTdf(object):
    """Print the highest-weighted TF-IDF terms for text rows read from MySQL.

    Workflow (see ``main``): fetch rows, segment each row's ``content`` with
    jieba, drop stop words, build a TF-IDF matrix with scikit-learn and print
    the top terms per row.

    The code in this class is written so that it is valid on both Python 2
    and Python 3 (``print(...)`` with a single argument, ``io.open``).
    """

    def init_db(self):
        """Connect to MySQL using the ``[db]`` section of ``db_file``.

        Sets ``self.conn`` and ``self.cur``. The cursor is a ``DictCursor``,
        so fetched rows are dicts keyed by column name.
        """
        dbc = configparser.ConfigParser()
        dbc.read(db_file)
        self.conn = MySQLdb.connect(
            host=dbc.get('db', 'db_host'),
            user=dbc.get('db', 'db_user'),
            passwd=dbc.get('db', 'db_pass'),
            db=dbc.get('db', 'db_database'),
            port=dbc.getint('db', 'db_port'),
            charset='utf8')
        self.cur = self.conn.cursor(MySQLdb.cursors.DictCursor)

    def __init__(self):
        # Set the attributes first so __del__ can tell "never connected"
        # apart from "connected" if init_db() raises.
        self.conn = None
        self.cur = None
        self.init_db()

    def get_data(self):
        """Return up to 1000 rows (``id``, ``content``), highest ``id`` first."""
        self.cur.execute("SELECT id, content FROM `table` WHERE 1 ORDER BY `id` DESC LIMIT 1000")
        return self.cur.fetchall()

    def get_words(self, data):
        """Yield one space-joined string of non-stop-word tokens per row.

        :param data: iterable of mappings that have a ``'content'`` text value.
        :returns: generator of strings, in the same order as ``data``.
        """
        # Local import keeps this block self-contained; io.open decodes
        # explicitly as UTF-8 on both Python 2 and 3.
        import io

        # A set gives O(1) membership tests in the per-token filter below, and
        # the ``with`` block closes the file once it has been read.
        with io.open(PATH('chinese_stopwords.txt'), encoding='utf-8') as fh:
            stop_words = {line.rstrip() for line in fh}

        for r in data:
            content = r['content'].strip()
            # Remove in-text whitespace so it does not end up as tokens.
            for blank in ('\n', ' ', '\t', '\r'):
                content = content.replace(blank, '')
            kept = [seg.word for seg in pseg.cut(content)
                    if seg.word not in stop_words]
            yield ' '.join(kept)

    def get_ids(self, data):
        """Yield a ``'<id> <content> Topic:\\n'`` heading string for each row."""
        for r in data:
            yield '%s %s Topic:\n' % (r['id'], r['content'])

    def __del__(self):
        """Close the cursor and connection if they were ever opened."""
        # getattr guards: __del__ also runs for instances whose construction
        # failed (or that were created without __init__), where the attributes
        # may be None or missing altogether.
        cur = getattr(self, 'cur', None)
        conn = getattr(self, 'conn', None)
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
            print('Finished!')

    @staticmethod
    def _top_terms(words, weights, n):
        """Return up to *n* ``(word, weight)`` pairs, highest weight first.

        Terms whose weight is not positive are skipped: a zero TF-IDF weight
        means the term does not occur in that document. The result is
        therefore shorter than *n* when the document has fewer than *n*
        weighted terms (or the vocabulary itself is smaller than *n*).
        """
        weights = np.asarray(weights, dtype=float)
        order = np.argsort(-weights)[:max(n, 0)]
        return [(words[i], weights[i]) for i in order if weights[i] > 0]

    def main(self, n=3):
        """Print the top *n* TF-IDF terms for every fetched row.

        :param n: maximum number of terms printed per row (default 3).
        """
        data = self.get_data()
        if not data:
            # Nothing to analyse; the vectorizer cannot be fitted on no documents.
            return
        list_words = list(self.get_words(data))
        list_ids = list(self.get_ids(data))

        # NOTE(review): CountVectorizer's documented default token pattern
        # keeps only tokens of two or more word characters, so
        # single-character words never become features - confirm this is
        # intended for Chinese text.
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(list_words))

        # Newer scikit-learn releases provide get_feature_names_out() and no
        # longer have get_feature_names(); older ones only have the latter.
        feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                         or vectorizer.get_feature_names)
        words = feature_names()
        weight = tfidf.toarray()

        for heading, row in zip(list_ids, weight):
            print(u'{}:'.format(heading))
            for rank, (word, score) in enumerate(self._top_terms(words, row, n), 1):
                print(u'-{}: {} {}'.format(rank, word, score))
            print('\n')
# Script entry point: constructing IfTdf connects to MySQL (via init_db), and main()
# prints the keywords. The instance is a temporary, so under CPython reference
# counting its __del__ (which closes the DB handles) can run once main() returns.
IfTdf().main()
參考: