1. 程式人生 > >第4.3章 request爬取小學3000詞語

第4.3章 requests爬取小學3000詞語

爬蟲並不是一定要用 scrapy 框架,下面介紹的這個就是通過 requests 直接獲取的,程式碼如下。生成田字格的程式碼請參考第4.1章〈給小朋友寫的飛鳥集打亂後組詞的爬蟲〉。

import requests
import os
import re
from pyquery import PyQuery as pq
from word_deal.primary_spelling import to_doc,duplicate_removal

# Output directory (absolute Windows path on the author's machine).
# NOTE(review): not referenced by any code visible in this file — confirm
# whether an imported helper relies on it before removing.
OUT_PATH = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\out\\'

# (output label, heading line as it appears in the saved text file), in
# document order.  Each heading marks where that semester's section begins;
# the section runs until the next heading (or end of file for the last one).
# The heading strings must match the scraped text exactly, including the
# inconsistent punctuation used by the source article.
_GRADE_SECTIONS = (
    ('一年級上冊', '一年級上冊生字: 100個\n'),
    ('一年級下冊', '一年級下冊生字:250個\n'),
    ('二年級上冊', '二年級上冊生字:350個\n'),
    ('二年級下冊', '二年級下冊生字:300個\n'),
    ('三年級上冊', '三年級上冊生字300個\n'),
    ('三年級下冊', '三年級下冊生字300個\n'),
    ('四年級上冊', '四年級上冊生字200個\n'),
    ('四年級下冊', '四年級下冊生字200個\n'),
    ('五年級上冊', '五年級上冊生字150個\n'),
    ('五年級下冊', '五年級下冊生字150個\n'),
    ('六年級上冊', '六年級上冊生字80個\n'),
    ('六年級下冊', '六年級下冊生字80個\n'),
)


def gen_yuwen_txt(xx_name):
    """Download the article and save its digit-bearing paragraphs to a file.

    The text of every ``#artibody p>font`` element is collected; only
    paragraphs that contain at least one digit are written, one per line,
    to ``xx_name`` (UTF-8).

    Args:
        xx_name: Path of the text file to create or overwrite.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request times out).
        OSError: If ``xx_name`` cannot be opened for writing.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(
        'http://k.sina.com.cn/article_6429307123_17f3770f30010033nv.html?from=baby',
        timeout=30,
    )
    soup = pq(r.content)
    paras = [pq(node).text() for node in soup('#artibody p>font')]

    with open(xx_name, 'w', encoding='utf-8') as out:
        for para in paras:
            # Keep only paragraphs containing a digit: in this article those
            # are the ones that carry the actual content.
            if not re.search(r'\d', para):
                continue
            # Correct a known typo in the source text by dropping this token.
            out.write(para.replace('識字', '') + '\n')


def get_lines(xx_name):
    """Return all lines of the UTF-8 text file ``xx_name``.

    Line terminators are kept, as with ``file.readlines()``.
    """
    with open(xx_name, encoding='utf-8') as src:
        return src.readlines()


def gen_by_nianji(xx_name):
    """Split the saved text by grade/semester and emit one document for each.

    Every heading in ``_GRADE_SECTIONS`` is located in the file first; the
    lines from one heading up to the next (or to end of file for the last
    heading) form that semester's section, which is passed through
    ``get_words`` and then ``to_pinyin``.

    Args:
        xx_name: Path of the text file produced by ``gen_yuwen_txt``.

    Raises:
        ValueError: If any expected heading line is missing from the file.
            All headings are resolved before any output is generated.
    """
    lines = get_lines(xx_name)
    # Locate headings by exact match so each section can be sliced out.
    starts = [lines.index(heading) for _, heading in _GRADE_SECTIONS]
    ends = starts[1:] + [len(lines)]
    for (label, _), start, end in zip(_GRADE_SECTIONS, starts, ends):
        to_pinyin(get_words(lines[start:end]), label)


def get_words(lines):
    """Extract the Chinese-character runs from lines that start with a digit.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list with one entry per digit-led line.  Each entry is the ``str()``
        of the list of matched runs (e.g. ``"['天地', '人']"``), not the runs
        themselves.
    """
    words = []
    for line in lines:
        # Only lines beginning with a digit are treated as content.
        if not re.match(r'^\d', line):
            continue
        # Keep just the runs of characters in the U+4E00..U+9FA5 range.
        m = re.findall(r'[\u4e00-\u9fa5]+', line)
        # NOTE(review): this stores the repr of the list rather than the
        # words; kept as-is because duplicate_removal's expected input is not
        # visible here — confirm before changing.
        words.append(str(m))
    return words


def to_pinyin(paragraphs, file_name):
    """De-duplicate ``paragraphs`` and hand them to ``to_doc``.

    Args:
        paragraphs: Entries as returned by ``get_words``.
        file_name: Output name without extension; ``'.docx'`` is appended.
    """
    words = duplicate_removal(paragraphs)
    # presumably to_doc writes a Word document at this name — verify in
    # word_deal.primary_spelling.
    to_doc(list(words), file_name + '.docx')


if __name__ == '__main__':
    xx_name = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\in\\xx.txt'
    gen_yuwen_txt(xx_name)
    gen_by_nianji(xx_name)