python 提取程式碼中的所有漢字

阿新 • • 發佈：2020-12-28

技術標籤：python

遇到一個需求：需要提取程式碼中所有用到的漢字，涉及 Lua 程式碼、C++ 程式碼以及 OC（Objective-C）程式碼。於是寫了一個 Python 指令碼，專門提取程式碼中雙引號字串裡的漢字。現在已經寫好了，貼在這裡供大家參考。

# -*- coding: UTF-8 -*-
import os

strStr = []  # strings already saved; findChinaStr appends here and isCanSave consults it to skip duplicates
suf_set = ('.lua','.cpp','.h','.hpp','.m','.mm')  # file-name suffixes that eachFile will scan
isFilterEnglish = 0 # 1 = save_to_file removes English letters from each string; any other value keeps them
# Directory to scan recursively (edit as needed)
filePathC = "C:\\Users\\panyafei\\Desktop\\test\\src"
# Output file the extracted strings are written to (edit as needed)
saveFilePath = 'C:\\Users\\panyafei\\Desktop\\test\\666.txt'
# Output file name (edit as needed)
saveName = "words.txt" # default output file name; only used by the __main__ block when saveFilePath is ""

def eachFile(filepath,saveFilePath):
    """Walk *filepath* recursively and scan every file with a known suffix.

    Each file whose full path ends with one of the suffixes in the
    module-level ``suf_set`` is passed to ``readFile`` together with
    *saveFilePath*. Files with any other suffix are ignored.
    """
    for folder, _subdirs, names in os.walk(filepath):
        candidates = (os.path.join(folder, name) for name in names)
        for source_path in candidates:
            if not source_path.endswith(suf_set):
                continue
            readFile(source_path, saveFilePath)

def save_to_file(file_name, contents):
    """Append *contents* to the text file *file_name*, one entry per line.

    When the module-level ``isFilterEnglish`` flag is 1, ASCII letters
    (A-Z and a-z, both ends inclusive) are removed from *contents* before
    writing, and "過濾英文" is printed once per removed letter.

    Entries are separated by a single newline: nothing is written before the
    first entry, and a newline is written before every later one. The file is
    opened in append mode, so it is created if missing and earlier entries
    are never re-read or rewritten.
    """
    if isFilterEnglish == 1:
        kept = []
        for char in contents:
            code = ord(char)
            # Inclusive bounds: 65-90 is 'A'-'Z', 97-122 is 'a'-'z'.
            if 65 <= code <= 90 or 97 <= code <= 122:
                print("過濾英文")
            else:
                kept.append(char)
        content = "".join(kept)
    else:
        content = contents

    # A separator is only needed when the file already holds an entry.
    needs_separator = os.path.exists(file_name) and os.path.getsize(file_name) > 0
    with open(file_name, 'a') as fh:
        if needs_separator:
            fh.write('\n')
        fh.write(content)

def readFile(filename,saveFilePath):
    """Scan one source file and pass every non-comment line to findChinaStr.

    The file is read line by line. Leading whitespace is stripped and empty
    lines are skipped. A line that opens a block comment (as reported by
    ``judgeIsZhiShiBegan``) switches the scanner into block-comment mode;
    while in that mode lines are skipped until ``judgeIsZhiShiEnd`` finds the
    terminator. Text following the terminator on the same line is still
    processed. Single-line comments (``judgeIsZhuShi``) are skipped as well.
    "過濾註釋！" is printed for every skipped line.

    filename      -- path of the source file to scan; also used by the helper
                     functions to choose Lua or C-style comment syntax.
    saveFilePath  -- output file forwarded to ``findChinaStr``.
    """
    in_block_comment = False
    # The context manager closes the file even if a helper raises.
    with open(filename) as source:
        for line in source:
            line = line.lstrip()
            if not line:
                continue

            if judgeIsZhiShiBegan(filename, line):
                in_block_comment = True

            if in_block_comment:
                isfound, pos = judgeIsZhiShiEnd(filename, line)
                if isfound == 1:
                    # Both terminators ("*/" and "]]") are two characters
                    # long. Strip only the line ending, so the last real
                    # character survives when the file has no final newline.
                    remainder = line[pos + 2:].rstrip('\r\n')
                    # "\n" marks "nothing left on this line" for the check below.
                    line = remainder or "\n"
                    in_block_comment = False

            if judgeIsZhuShi(filename, line) == 1 or line[0] == '\n' or in_block_comment:
                print("過濾註釋！")
            else:
                findChinaStr(line, saveFilePath)

def judgeIsZhuShi(filename,str):
    """Return 1 if the line *str* starts with a single-line comment, else 0.

    Leading spaces are ignored. For files whose name ends with ".lua" the
    comment marker is "--"; for every other file it is "//".
    An empty line is never a comment.

    NOTE: the parameter name ``str`` shadows the builtin; it is kept so the
    function signature stays unchanged for existing callers.
    """
    if not str:
        return 0
    marker = '--' if filename.endswith('.lua') else '//'
    # Only spaces are skipped, not tabs or other whitespace.
    return 1 if str.lstrip(' ').startswith(marker) else 0

def judgeIsZhiShiBegan(filename,str):
    """Return 1 if the line *str* starts with a block-comment opener, else 0.

    Leading spaces are ignored. For files whose name ends with ".lua" the
    opener is "--[["; for every other file it is "/*".
    An empty line never opens a comment.

    NOTE: the parameter name ``str`` shadows the builtin; it is kept so the
    function signature stays unchanged for existing callers.
    """
    if not str:
        return 0
    opener = '--[[' if filename.endswith('.lua') else '/*'
    # Only spaces are skipped, not tabs or other whitespace.
    return 1 if str.lstrip(' ').startswith(opener) else 0

def judgeIsZhiShiEnd(filename,str):
    """Look for a block-comment terminator anywhere in the line *str*.

    For files whose name ends with ".lua" the terminator is "]]"; for every
    other file it is "*/".

    Returns a tuple ``(found, pos)``: ``found`` is 1 when the terminator
    occurs in the line and 0 otherwise; ``pos`` is the index of the
    terminator's first character, or -1 when it is absent.

    NOTE: the parameter name ``str`` shadows the builtin; it is kept so the
    function signature stays unchanged for existing callers.
    """
    terminator = ']]' if filename.endswith('.lua') else '*/'
    # Search for the full two-character terminator so that a lone "]" or "*"
    # earlier in the line cannot hide a real terminator further right.
    pos = str.find(terminator)
    isZhuShi = 1 if pos != -1 else 0
    return isZhuShi,pos

def findChinaStr(str,saveFilePath):
    """Extract double-quoted literals containing non-ASCII text from one line.

    The line *str* is scanned for text between double quotes. A backslash
    inside a literal escapes the following character, so ``\\"`` does not end
    the literal; the escape sequence is kept verbatim in the extracted text.
    Every literal that ``isCanShow`` accepts and ``isCanSave`` reports as new
    is appended to the module-level ``strStr`` list, written to
    *saveFilePath* via ``save_to_file`` and printed.

    If a literal is not closed before the end of the line, the scan stops at
    the last character, which is not included in the extracted text.

    NOTE: the parameter name ``str`` shadows the builtin; it is kept so the
    function signature stays unchanged for existing callers.
    """
    dataLen = len(str)
    i = 0
    while i < dataLen:
        if str[i] == '"' and i + 1 < dataLen:
            i += 1
            pieces = []
            while str[i] != '"' and i + 1 < dataLen:
                if str[i] == '\\' and i + 2 < dataLen:
                    # Keep the backslash and consume the escaped character
                    # with it, so an escaped quote is not taken as the end.
                    pieces.append(str[i])
                    i += 1
                pieces.append(str[i])
                i += 1
            chinese = "".join(pieces)
            if isCanShow(chinese) and isCanSave(chinese) == 1:
                strStr.append(chinese)
                save_to_file(saveFilePath, chinese)
                if isinstance(chinese, bytes):
                    # Python 2 byte string: re-encode UTF-8 as GBK for the
                    # console. 'replace' keeps one undecodable literal from
                    # aborting the whole scan.
                    print(chinese.decode('utf-8', 'replace').encode('gbk', 'replace'))
                else:
                    # Python 3 text string: print handles the encoding.
                    print(chinese)
        i += 1

def isCanSave(chinese):
    """Return 0 if *chinese* is already in the module-level ``strStr``, else 1."""
    already_seen = chinese in strStr
    return 0 if already_seen else 1


def isCanShow(str):
    """Return True when *str* contains at least one non-ASCII character.

    A string made up only of characters with code points up to 127,
    including the empty string, yields False.
    """
    return any(ord(symbol) > 127 for symbol in str)

if __name__ == '__main__':
    # Fall back to the current working directory when the configured scan
    # directory is empty or does not exist.
    if filePathC == "" or not os.path.exists(filePathC):
        message = "未設定路徑或者路徑不存在,是否預設當前路徑,按任意鍵繼續，退出請關閉！"
        if isinstance(message, bytes):
            # Python 2 byte string: re-encode UTF-8 as GBK for the console.
            print(message.decode('utf-8', 'replace').encode('gbk', 'replace'))
        else:
            print(message)
        # NOTE(review): "pause" looks like the Windows shell command that
        # waits for a key press; confirm behaviour if other platforms matter.
        os.system("pause")
        filePathC = os.getcwd()

    # With no explicit output file, place saveName next to the scan directory.
    if saveFilePath == "":
        path = os.path.abspath(os.path.dirname(filePathC))
        saveFilePath = os.path.join(path, saveName)

    if os.path.exists(saveFilePath):
        print('檔案存在,清空內容！')
    else:
        print('檔案不存在，建立檔案')
    # Mode 'w' creates a missing file and empties an existing one; the
    # context manager makes sure the handle is closed before scanning starts.
    with open(saveFilePath, 'w'):
        pass

    eachFile(filePathC,saveFilePath)

    print("--------------------finish--------------------")
    os.system("pause")

指令碼中可以自己設定需要查詢的路徑（filePathC）和儲存結果的檔案（saveFilePath / saveName）；如果沒有設定查詢路徑，或者該路徑不存在，就會預設使用當前工作目錄。

同時也支援設定文字型別，暫時支援

'.lua','.cpp','.h','.hpp','.m','.mm'

這六種型別，後期可以自己增加刪除檔案型別。

isFilterEnglish 可以設定是否過濾中文中間夾雜的英文字元，預設是不過濾的，如有需求可以自行修改 0->表示不過濾   1->表示過濾

還有一點需要說明：輸出檔案的型別。我這裡寫的是 .txt 格式；如果你希望用表格軟體開啟，可以把 saveName = "words.txt" 改成 saveName = "words.xls" 或者 saveName = "words.xlsx"。注意：saveName 只有在 saveFilePath 設為空字串時才會生效，而且這樣只是改了副檔名，寫入的內容仍然是每行一個字串的純文字。

好了，到這裡就結束了，此指令碼如有更新，會自動上傳更新！

python 提取程式碼中的所有漢字

技術標籤：python 遇到一個需求，需要提取程式碼中所有用到的漢字，有lua程式碼c++程式碼還有oc程式碼，於是研究了一個指令碼，專門提取程式碼中的漢字，現在研究好了，在這裡貼一下，供大家參考

Python 捕獲程式碼中所有異常的方法

問題怎樣捕獲程式碼中的所有異常？解決方案想要捕獲所有的異常，可以直接捕獲 Exception 即可：

python提取檔案中所有ip

引用正則表示式 import re, socket ip_lists = []ip_regex = re.compile(r\'(25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])\\.(25

Java提取字串中的漢字、字母、數字

1.提取漢字 public static void main(String[] args) { String str = \" 我是123一段測abd試空a格的字元串 \";

Python提取視訊中圖片的示例（按幀、按秒）

一、按幀提取 #coding=utf-8 import os import cv2 def save_img():#提取視訊中圖片按照每幀提取

使用Python提取文字中含有特定字串的方法示例

今天搞了一天的文字處理，發現python真的太適合做資料處理了。廢話不多說，一起學習吧！

python檢視陣列中所有元素是否相同

技術標籤：python科學計算pythonnumpy 不知道大家有沒有過這種經歷，就是想要判斷兩個陣列運算後得到的新陣列中的各個元素值是否相同。這裡給出一種使用np.unique()的方法，程式碼如下：

python 將列表中所有資料取反_Python資料結構中的列表

技術標籤：python 將列表中所有資料取反 @Author：Runsen 資料結構 python有三種內建的資料結構：列表、元組和字典。

這款Python庫4行程式碼提取圖片中的文字

OCR是什麼？有一款軟體叫掃描全能王，想必一些小夥伴聽過，這是一個OCR整合軟體，可以將影象內容掃描成文字。

試試 python-dotenv，避免敏感資訊被硬編碼到程式碼中

我們開發的每個系統都離不開配置資訊，例如資料庫密碼、Redis密碼、郵件配置、各種第三方配置資訊，這些資訊都非常敏感，一旦洩露出去後果非常嚴重，被洩露的原因一般是程式設計師將配置資訊和程式碼混在一起導致的。

python輸出陣列中指定元素的所有索引示例

如下所示，程式碼為： array也可直接使用上面程式碼。測試如下：以上這篇python輸出陣列中指定元素的所有索引示例就是小編分享給大家的全部內容了，希望能給大家一個參考，也希望大家多多支援我們。

python [:3] 實現提取陣列中的數

搜尋答案搜尋不到，自己試了一把．首先生成一維陣列 a =np.array([1,2,3,4,5,6,7,8,9])

Python Opencv提取圖片中某種顏色組成的圖形的方法

主要目標識別圖中紅色的裂縫，嘗試了幾種不同的方法，最後發現比較每一點的RGB差值可以很好的解決這個問題，也就是提取圖片中的紅色相關資訊。處理結果如下：

Python如何將影象音視訊等資原始檔隱藏在程式碼中(小技巧)

下午有同學Python學習群裡說，使用pyinstaller打包原始碼時，因為程式碼中使用了影象、音訊、視訊等資原始檔，無法將程式打包成單一的可執行檔案。有沒有方法將這些資原始檔按儲存在程式碼中呢？我想了一下，應該是可

python實現訊號時域統計特徵提取程式碼

1.實驗資料需求為了對採集的壓力實驗資料做特徵工程，需要對訊號進行時域的統計特徵提取，包含了均值、均方根、偏度、峭度、波形因子、波峰因子、脈衝因子、峭度因子等，現用python對其進行實現。

python統計字串中字母出現次數程式碼例項

程式碼如下 dic=dict() d={} s=set() s=\'helloworld\' （1）d=dict() for x in s: 　　if x not in d.keys():

Python object類中的特殊方法程式碼講解

python版本：3.8 class object: \"\"\" The most base type \"\"\" # del obj.xxx或delattr(obj,\'xxx\')時被呼叫，刪除物件中的一個屬性

Python程式碼中如何讀取鍵盤錄入的值

讀取鍵盤輸入 Python提供了兩個內建函式從標準輸入讀入一行文字，預設的標準輸入是鍵盤。如下：

python程式碼中怎麼換行

在寫程式碼過程中，經常遇到一行程式碼很長的情況。為了讓程式碼顯得整齊乾淨，就需要把一行程式碼分成多行來寫，Python中有兩種小技巧可以實現該功能：

win10系統下怎麼批量提取資料夾中所有檔名稱

在使用win10系統的過程中，有時候需要獲取資料夾中所有檔案的名稱，而那個資料夾中的檔案又太多的話，如果一個個複製太麻煩了，那麼其實我們可以建立一個簡單的指令碼命令來批量提取資料夾中所有檔名稱，下面給大家分

python 提取程式碼中的所有漢字

相關推薦