
python---chinese text classification
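This post walks through the preprocessing stage of a Chinese text classification pipeline: segment a GBK-encoded corpus with jieba, then pack the segmented files into a scikit-learn Bunch and pickle it for the feature-extraction step that follows.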


#http://blog.csdn.net/github_36326955/article/details/54891204#comments

#

#-*- coding: UTF-8 -*-

# importlib.reload(sys) is a Python 2-era habit (sys.setdefaultencoding); it has no effect in Python 3 and can be dropped
import importlib, sys
importlib.reload(sys)
#cnt = 1

"""
from lxml import html
def html2txt(path):
    with open(path,"rb") as f:
        content = f.read()
    page = html.document_fromstring(content)
    text = page.text_content()
    return text

if __name__ == "__main__":
    path = "test.htm"
    text = html2txt(path)
    print(text)
""" """ import jieba seg_list = jieba.cut("我來到北京清華大學",cut_all=True) print("Full Mode:"+"/".join(seg_list)) seg_list = jieba.cut("我來到北京清華大學",cut_all=False) print("Default(Accurate) Mode:"+"/".join(seg_list)) seg_list = jieba.cut("他來到網易杭研大廈") print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明碩士畢業於中國科學院計算所,後在日本京都大學深造") #搜索引擎模式 print(", ".join(seg_list))
""" import os import jieba jieba.enable_parallel() def savefile(path,content,_encode=utf-8): with open(path,w,encoding=_encode) as f: f.write(content) def readfile(path,_encode=utf-8): with open(path,r,encoding=_encode, errors=ignore) as f: content = f.read()
return content def preprocess(content,save_path): ‘‘‘ global cnt if cnt == 1: print(type(content)) print(content) cnt += 1 ‘‘‘ content = content.replace("\r\n","") content = content.replace(" ","") content_seg = jieba.cut(content) content_seg = " ".join(content_seg) ‘‘‘ if cnt == 2: print(type(content_seg)) cnt += 1 ‘‘‘ savefile(save_path,‘‘.join(content_seg)) def corpus_segment(corpus_path,seg_path): catelist = os.listdir(corpus_path) for subdir in catelist: class_path = os.path.join(corpus_path,subdir) #class_path = os.path.join(class_path,"") cur_seg_path = os.path.join(seg_path,subdir) #seg_path = os.path.join(seg_path,"") if not os.path.exists(cur_seg_path): os.makedirs(cur_seg_path) if ".DS_Store" not in class_path: file_list = os.listdir(class_path) for filename in file_list: file_path = os.path.join(class_path,filename) content = readfile(file_path,_encode=gbk) save_path = os.path.join(cur_seg_path,filename) preprocess(" ".join(content), save_path) print("中文語料分詞結束") if __name__ == "__main__": corpus_path = "/Users/k/PycharmProjects/prac/train_corpus" seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg" corpus_segment(corpus_path,seg_path) corpus_path = "/Users/k/PycharmProjects/prac/test_corpus" seg_path = "/Users/k/PycharmProjects/prac/test_corpus_seg" corpus_segment(corpus_path,seg_path) """ from sklearn.datasets.base import Bunch bunch = Bunch(target_name=[],lable=[],filenames=[],contents=[]) """
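After segmentation, every file under train_corpus_seg mirrors the original corpus layout but holds space-separated tokens. A minimal sanity check is to read one segmented file back (a sketch only; it assumes the paths from the script above and just grabs the first category and file it finds):

import os

seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg"
category = [d for d in os.listdir(seg_path) if "DS_Store" not in d][0]  # first category subdirectory
sample = os.listdir(os.path.join(seg_path, category))[0]                # first document in it
with open(os.path.join(seg_path, category, sample), "r", encoding="utf-8") as f:
    tokens = f.read().split(" ")
print(category, sample, tokens[:20])  # first 20 tokens of one document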

#

import os
import pickle
from sklearn.datasets.base import Bunch  # in scikit-learn >= 0.24 this moved to sklearn.utils.Bunch

"""
‘_‘為了增強可讀性
"""


def _readfile(path):
    with open(path,"rb",) as f:
        content = f.read()
    return content

def corpus2Bunch(word_bag_path,seg_path):
    catelist = os.listdir(seg_path)
    # target_name: all category names; label/filename: per-document category and path; contents: raw segmented text
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = [x for x in catelist if "DS_Store" not in str(x) and "txt" not in str(x)]
    bunch.target_name.extend(catelist)
    for subdir in catelist:
        class_path = os.path.join(seg_path,subdir)
        #class_path = os.path.join(class_path,"")
        filename_list = os.listdir(class_path)
        for filename in filename_list:
            filepath = os.path.join(class_path,filename)
            bunch.label.append(subdir)
            bunch.filename.append(filepath)
            bunch.contents.append(_readfile(filepath)) #append bytes
    with open(word_bag_path,"wb") as file_obj:
        pickle.dump(bunch,file_obj)
    print("構建文本對象結束!")

if __name__ == "__main__":
    word_bag_path = "/Users/k/PycharmProjects/prac/train_word_bag/train_set.dat"
    seg_path = "/Users/k/PycharmProjects/prac/train_corpus_seg"
    corpus2Bunch(word_bag_path,seg_path)

    word_bag_path = "/Users/k/PycharmProjects/prac/test_word_bag/train_set.dat"
    seg_path = "/Users/k/PycharmProjects/prac/test_corpus_seg"
    corpus2Bunch(word_bag_path,seg_path)
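The pickled Bunch is the hand-off point to feature extraction. A minimal sketch of the next step, loading the Bunch back and fitting TF-IDF weights over the segmented contents (assumes the train path above; TfidfVectorizer's encoding argument handles the bytes stored in bunch.contents):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

with open("/Users/k/PycharmProjects/prac/train_word_bag/train_set.dat", "rb") as f:
    bunch = pickle.load(f)

print(bunch.target_name)    # category names
print(len(bunch.contents))  # number of documents

# Tokens are already space-separated; note the default token_pattern drops single-character tokens.
vectorizer = TfidfVectorizer(encoding="utf-8", sublinear_tf=True, max_df=0.5)
tfidf_matrix = vectorizer.fit_transform(bunch.contents)  # documents x terms, sparse
print(tfidf_matrix.shape)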
