
Python web scraper practice


Using BeautifulSoup to strip specific tags (here, the <br> tags inside each paragraph) before saving the text.
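A minimal standalone sketch of that idea first (the HTML string below is made up purely for illustration); the full exercise script follows.

from bs4 import BeautifulSoup

sample = "<p>first line<br/>second line</p>"   # made-up markup, just for the demo
soup = BeautifulSoup(sample, "html.parser")
[tag.extract() for tag in soup("br")]          # soup("br") is shorthand for soup.find_all("br")
print(soup.find_all("p"))                      # [<p>first linesecond line</p>]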

# url
import easygui as g
import urllib.request
from bs4 import BeautifulSoup
import os
import sys
import re
import config.story2 as urls
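# Note: config/story2.py is not shown in this post. Judging from the getStory() call
# further down, it is assumed to define three strings, roughly:
#   url1 = "..."   # catalogue (table-of-contents) page of the novel
#   url2 = "..."   # front half used to assemble each article URL
#   url3 = "..."   # back half appended after the data-nsrc value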

# Collect the URLs from the user via an easygui dialog
def set_url():

    msg = "請填寫一下信息(其中帶*號的項為必填項)"
    title = "爬蟲練習"
    fieldNames = ["*小說目錄地址", "*組裝前半段", "後半段"]
    fieldValues = g.multenterbox(msg, title, fieldNames)
    while True:
        if fieldValues is None:
            break
        errmsg = ""
        for i in range(len(fieldNames)):
            option = fieldNames[i].strip()
            if fieldValues[i].strip() == "" and option[0] == "*":
                errmsg += ("【%s】為必填項  " % fieldNames[i])
        if errmsg == "":
            break
        fieldValues = g.multenterbox(errmsg, title, fieldNames, fieldValues)
    return fieldValues

# Download the catalogue page and collect each article title with its download URL
def get_urls(seed_url, pre_url, last_url):
    # Maps article title -> assembled article URL
    storyList = {}
    response = urllib.request.urlopen(seed_url)
    html = response.read().decode("utf-8")
    bs = BeautifulSoup(html, "html.parser")
    contents = bs.find_all("div", {"class": "c-line-bottom"})
    for each in contents:
        # Read the article's data-nsrc attribute
        nsrc = each.a["data-nsrc"]
        # Assemble the article URL from the configured front and back halves
        story_url = pre_url + nsrc + last_url
        # Article title
        title = each.p.string
        storyList[title] = story_url
    return storyList

# Fetch every story and save it to disk
def getStory():
    savepath = "E:\\stories\\"
    storyList = get_urls(urls.url1, urls.url2, urls.url3)
    storyNames = list(storyList.keys())
    for i in range(len(storyNames)):
        # Fetch the story page
        html = urllib.request.urlopen(storyList[storyNames[i]]).read().decode("utf-8")
        bs = BeautifulSoup(html, "html.parser")
        [s.extract() for s in bs("br")]  # this turned out to work
        content = bs.find_all("p")
        # [ss.extract() for ss in content('p')]
        # Tried the extract here instead, which fails: TypeError: 'ResultSet' object is not callable
        # Also tried stripping the <br> decoration by string replacement, which did not work either:
        # oldstr = r'<br style="font-size:16px;font-weight:normal;' \
        #          r'margin-left:4px;margin-right:4px;float:none;color:rgb(0, 0, 0);' \
        #          r'text-align:-webkit-auto;text-indent:0px;white-space:normal;' \
        #          r'text-overflow:clip;clear:none;display:inline;"/>'
        # print(content)
        with open(savepath + storyNames[i] + ".txt", "w", encoding="utf-8") as f:
            f.writelines(str(content))

# download(get_url())
# get_url()
getStory()
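About the commented-out line above: bs("br") works because calling a BeautifulSoup or Tag object is shorthand for find_all, but find_all itself returns a ResultSet (a list subclass), which is not callable, hence the TypeError. A hedged sketch of doing the same cleanup per paragraph instead (variable names here are illustrative only):

paragraphs = bs.find_all("p")
for p_tag in paragraphs:
    # each element of the ResultSet is a Tag, and Tags do support find_all
    for br in p_tag.find_all("br"):
        br.extract()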
