
python---chinese text classification
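This post walks through the preprocessing stage of a Chinese text classification pipeline: segment a GBK-encoded corpus with jieba, then pack the segmented files into a scikit-learn Bunch and pickle it for the feature-extraction step that follows.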


#http://blog.csdn.net/github_36326955/article/details/54891204#comments

#

#-*- coding: UTF-8 -*-

# importlib.reload(sys) is a Python 2-era habit (sys.setdefaultencoding); it has no effect in Python 3 and can be dropped
import importlib, sys
importlib.reload(sys)
#cnt = 1

"""
from lxml import html
def html2txt(path):
    with open(path,"rb") as f:
        content = f.read()
    page = html.document_fromstring(content)
    text = page.text_content()
    return text

if __name__ == "__main__":
    path = "test.htm"
    text = html2txt(path)
    print(text)
""" """ import jieba seg_list = jieba.cut("我來到北京清華大學",cut_all=True) print("Full Mode:"+"/".join(seg_list)) seg_list = jieba.cut("我來到北京清華大學",cut_all=False) print("Default(Accurate) Mode:"+"/".join(seg_list)) seg_list = jieba.cut("他來到網易杭研大廈") print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明碩士畢業於中國科學院計算所,後在日本京都大學深造") #搜索引擎模式 print(", ".join(seg_list))
""" import os import jieba jieba.enable_parallel() def savefile(path,content,_encode=utf-8): with open(path,w,encoding=_encode) as f: f.write(content) def readfile(path,_encode=utf-8): with open(path,r,encoding=_encode, errors=ignore) as f: content = f.read()
return content def preprocess(content,save_path): ‘‘‘ global cnt if cnt == 1: print(type(content)) print(content) cnt += 1 ‘‘‘ content = content.replace("\r\n","") content = content.replace(" ","") content_seg = jieba.cut(content) content_seg = " ".join(content_seg) ‘‘‘ if cnt == 2: print(type(content_seg)) cnt += 1 ‘‘‘ savefile(save_path,‘‘.join(content_seg)) def corpus_segment(corpus_path,seg_path): catelist = os.listdir(corpus_path) for subdir in catelist: class_path = os.path.join(corpus_path,subdir) #class_path = os.path.join(class_path,"") cur_seg_path = os.path.join(seg_path,subdir) #seg_path = os.path.join(seg_path,"") if not os.path.exists(cur_seg_path): os.makedirs(cur_seg_path) if ".DS_Store" not in class_path: file_list = os.listdir(class_path) for filename in file_list: file_path = os.path.join(class_path,filename) content = readfile(file_path,_encode=gbk) save_path = os.path.join(cur_seg_path,filename) preprocess(" ".join(content), save_path) print("中文語料分詞結束") if __name__ == "__main__": corpus_path = "/Users/k/PycharmProjects/prac/train_corpus" seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg" corpus_segment(corpus_path,seg_path) corpus_path = "/Users/k/PycharmProjects/prac/test_corpus" seg_path = "/Users/k/PycharmProjects/prac/test_corpus_seg" corpus_segment(corpus_path,seg_path) """ from sklearn.datasets.base import Bunch bunch = Bunch(target_name=[],lable=[],filenames=[],contents=[]) """
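After segmentation, every file under train_corpus_seg mirrors the original corpus layout but holds space-separated tokens. A minimal sanity check is to read one segmented file back (a sketch only; it assumes the paths from the script above and just grabs the first category and file it finds):

import os

seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg"
category = [d for d in os.listdir(seg_path) if "DS_Store" not in d][0]  # first category subdirectory
sample = os.listdir(os.path.join(seg_path, category))[0]                # first document in it
with open(os.path.join(seg_path, category, sample), "r", encoding="utf-8") as f:
    tokens = f.read().split(" ")
print(category, sample, tokens[:20])  # first 20 tokens of one document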

#

import os
import pickle
from sklearn.datasets.base import Bunch  # in scikit-learn >= 0.24 this moved to sklearn.utils.Bunch

"""
‘_‘為了增強可讀性
"""


def _readfile(path):
    with open(path,"rb",) as f:
        content = f.read()
    return content

def corpus2Bunch(word_bag_path,seg_path):
    catelist = os.listdir(seg_path)
    # target_name: all category names; label/filename: per-document category and path; contents: raw segmented text
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = [x for x in catelist if "DS_Store" not in str(x) and "txt" not in str(x)]
    bunch.target_name.extend(catelist)
    for subdir in catelist:
        class_path = os.path.join(seg_path,subdir)
        #class_path = os.path.join(class_path,"")
        filename_list = os.listdir(class_path)
        for filename in filename_list:
            filepath = os.path.join(class_path,filename)
            bunch.label.append(subdir)
            bunch.filename.append(filepath)
            bunch.contents.append(_readfile(filepath)) #append bytes
    with open(word_bag_path,"wb") as file_obj:
        pickle.dump(bunch,file_obj)
    print("構建文本對象結束!")

if __name__ == "__main__":
    word_bag_path = "/Users/k/PycharmProjects/prac/train_word_bag/train_set.dat"
    seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg"
    corpus2Bunch(word_bag_path,seg_path)

    word_bag_path = "/Users/k/PycharmProjects/prac/test_word_bag/train_set.dat"
    seg_path = "/Users/k/PycharmProjects/prac/test_corpus_seg"
    corpus2Bunch(word_bag_path,seg_path)
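The pickled Bunch is the hand-off point to feature extraction. A minimal sketch of the next step, loading the Bunch back and fitting TF-IDF weights over the segmented contents (assumes the train path above; TfidfVectorizer's encoding argument handles the bytes stored in bunch.contents):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

with open("/Users/k/PycharmProjects/prac/train_word_bag/train_set.dat", "rb") as f:
    bunch = pickle.load(f)

print(bunch.target_name)    # category names
print(len(bunch.contents))  # number of documents

# Tokens are already space-separated; note the default token_pattern drops single-character tokens.
vectorizer = TfidfVectorizer(encoding="utf-8", sublinear_tf=True, max_df=0.5)
tfidf_matrix = vectorizer.fit_transform(bunch.contents)  # documents x terms, sparse
print(tfidf_matrix.shape)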
