第4.3章 用requests爬取小學3000詞語
阿新 • • 發佈:2018-12-20
爬蟲並不一定要用scrapy框架,下面介紹的這個爬蟲就是通過requests直接獲取網頁內容的,程式碼如下。生成田字格的程式碼請參考第4.1章〈給小朋友寫的飛鳥集打亂後組詞的爬蟲〉。
import requests
import os
import re
from pyquery import PyQuery as pq
from word_deal.primary_spelling import to_doc,duplicate_removal
# Hard-coded Windows output directory.
# NOTE(review): not referenced by any function in the visible code — confirm
# whether it is still needed (presumably intended for the generated .docx files).
OUT_PATH = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\out\\'
def gen_yuwen_txt(xx_name):
    """Download the word-list article and save its digit-bearing entries to a file.

    Fetches the hard-coded Sina article, takes the text of every element
    matched by the selector ``#artibody p>font``, keeps only the entries that
    contain at least one digit, removes the substring '識字' from each of them
    and writes them, one per line, to ``xx_name`` encoded as UTF-8.

    Args:
        xx_name: Path of the text file to create (truncated if it exists).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``xx_name`` cannot be opened or written.
    """
    r = requests.get(
        'http://k.sina.com.cn/article_6429307123_17f3770f30010033nv.html?from=baby',
        timeout=30,  # without a timeout a stalled server would hang the script forever
    )
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    r.raise_for_status()
    soup = pq(r.content)
    texts = [pq(node).text() for node in soup('#artibody p>font')]
    # Keep only entries containing a digit; per the original author, those are
    # the entries of the article that carry the real content.
    paras = [text for text in texts if re.search(r'\d', text)]
    # The context manager closes the file even if a write fails.
    with open(xx_name, 'w', encoding='utf-8') as out:
        for para in paras:
            # The original author strips '識字' as a correction of typos in the article.
            out.write(para.replace('識字', '') + '\n')
def get_lines(xx_name):
    """Return all lines of a UTF-8 text file, line terminators included.

    Args:
        xx_name: Path of the file to read.

    Returns:
        A list with one string per line, in file order; an empty list for an
        empty file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when reading fails.
    with open(xx_name, encoding='utf-8') as file:
        return file.readlines()
def gen_by_nianji(xx_name):
    """Split the saved word list into per-term sections and export each one.

    Reads ``xx_name`` with ``get_lines``, locates the heading line of each of
    the twelve school terms (grades one to six, first and second volume), and
    treats the lines from one heading up to the next heading as that term's
    section (the last section runs to the end of the file). Every section is
    passed through ``get_words`` and the result is handed to ``to_pinyin``
    together with the term's label.

    Args:
        xx_name: Path of the text file to read.

    Raises:
        ValueError: If any of the expected heading lines is not present
            verbatim in the file (raised by ``list.index``).
    """
    # (heading line that opens the section, label passed to to_pinyin)
    sections = (
        ('一年級上冊生字: 100個\n', '一年級上冊'),
        ('一年級下冊生字:250個\n', '一年級下冊'),
        ('二年級上冊生字:350個\n', '二年級上冊'),
        ('二年級下冊生字:300個\n', '二年級下冊'),
        ('三年級上冊生字300個\n', '三年級上冊'),
        ('三年級下冊生字300個\n', '三年級下冊'),
        ('四年級上冊生字200個\n', '四年級上冊'),
        ('四年級下冊生字200個\n', '四年級下冊'),
        ('五年級上冊生字150個\n', '五年級上冊'),
        ('五年級下冊生字150個\n', '五年級下冊'),
        ('六年級上冊生字80個\n', '六年級上冊'),
        ('六年級下冊生字80個\n', '六年級下冊'),
    )
    lines = get_lines(xx_name)

    # Resolve every heading position up front, so a missing heading aborts
    # before anything is exported.
    starts = [lines.index(heading) for heading, _ in sections]
    # Each section ends where the next one begins; None lets the last slice
    # run to the end of the file.
    stops = starts[1:] + [None]

    # Extract all sections first, then export them in term order.
    extracted = [get_words(lines[lo:hi]) for lo, hi in zip(starts, stops)]
    for words, (_, label) in zip(extracted, sections):
        to_pinyin(words, label)
def get_words(lines):
    """Extract the Chinese text from the lines that begin with a digit.

    Lines that do not start with a digit are dropped. For every remaining
    line, all runs of characters in the range U+4E00..U+9FA5 are collected and
    the resulting list is converted with ``str()``, so each returned element
    is the textual form of a list (for example ``"['天地人']"``), not the list
    itself.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of strings, one per digit-leading line, in input order.
    """
    starts_with_digit = re.compile(r'^\d')
    chinese_run = re.compile(r'[\u4e00-\u9fa5]+')
    return [
        str(chinese_run.findall(entry))
        for entry in lines
        if starts_with_digit.match(entry)
    ]
def to_pinyin(paragraphs,file_name):
    """De-duplicate the given entries and export them to a ``.docx`` file.

    Passes ``paragraphs`` through ``duplicate_removal``, appends ``'.docx'`` to
    ``file_name`` and hands the materialised list plus that name to ``to_doc``.
    NOTE(review): both helpers come from ``word_deal.primary_spelling``; their
    exact behaviour (presumably pinyin/practice-sheet generation) is not
    visible here — confirm against that module.

    Args:
        paragraphs: Entries to export, as produced by ``get_words``.
        file_name: Output name without extension.
    """
    unique_entries = duplicate_removal(paragraphs)
    docx_name = file_name + '.docx'
    to_doc(list(unique_entries), docx_name)
if __name__ == '__main__':
    # Intermediate text file shared by both steps (hard-coded Windows path).
    xx_name = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\in\\xx.txt'
    # Order matters: the article is scraped into xx_name first, then that file
    # is split by term and exported.
    for step in (gen_yuwen_txt, gen_by_nianji):
        step(xx_name)