Sharing some Python web-crawler source code; anyone who needs it is welcome to grab it.
Batch-downloading Baidu Images with Python
# !/usr/bin/env python
# -*- coding:utf-8 -*-
# Import urlencode from urllib to build the query string
from urllib.parse import urlencode
# HTTP request library
import requests
# For handling JSON-formatted responses
import json

# Build the complete request URL
def page_url_cont():
    # Two fields matter here: the search keyword and the page offset
    # queryWord / word -> the keyword
    # pn -> the page offset
    data = {'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'is': '',
            'fp': 'result', 'queryWord': '美女', 'cl': 2, 'lm': -1,
            'ie': 'utf-8', 'oe': 'utf-8', 'adpicid': '', 'st': '-1', 'z': '',
            'ic': 0, 'word': '美女', 's': '', 'se': '', 'tab': '',
            'width': '', 'height': '', 'face': 0, 'istype': 2, 'qc': '',
            'nc': 1, 'fr': '', 'pn': 0, 'rn': 30, 'gsm': 96}
    # Join the base URL and the encoded parameters
    url = 'http://image.baidu.com/search/index?' + urlencode(data)
    # Return the finished URL
    return url

# Extract the image links and return them as a list
def get_images_url(response):
    # json.loads parses the dict-like response text into a real dict
    jsondata = json.loads(response.text)
    # Collect the extracted URLs in this list
    image_url = []
    # Check that the dict actually contains a 'data' key
    if 'data' in jsondata.keys():
        # Walk through the value of the 'data' key
        for items in jsondata.get('data'):
            # Each item holds its thumbnail link under 'thumbURL'
            url = items.get('thumbURL')
            image_url.append(url)
    # Return the whole list at once
    return image_url

# Downloader
def downimage(image_url):
    # Take the URLs out of the list one by one
    for url in image_url:
        try:
            # Request the image itself
            response = requests.get(url)
            # Check the status code
            if response.status_code == 200:
                # Slice a file name out of the URL
                name = url.split(',')[-1].split('&')[0]
                # Read the payload as bytes
                image = response.content
                # Open a new file in binary mode, aliased as f, and write the bytes
                with open('./images/%s.jpg' % name, 'wb') as f:
                    f.write(image)
        except:
            print('Request failed')

def main():
    url = page_url_cont()
    # Send the GET request and keep the response
    response = requests.get(url)
    # Parse the JSON body and pull out the image links
    image_url = get_images_url(response)
    # Hand the list to the downloader
    downimage(image_url)

if __name__ == '__main__':
    main()
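Two caveats before running this: the script writes into ./images/ but never creates that directory, and pn is fixed at 0, so only the first 30 results come back. A small sketch of the directory fix, plus a hypothetical paging loop that assumes page_url_cont were extended to take pn as a parameter (it does not, as written):

import os

# Ensure the output directory exists before downimage() writes into it
os.makedirs('./images', exist_ok=True)

# Hypothetical paging (page_url_cont would need a pn parameter for this):
# each request returns rn=30 results, so pn steps by 30
# for pn in range(0, 90, 30):
#     response = requests.get(page_url_cont(pn))
#     downimage(get_images_url(response))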
Batch-downloading meme stickers from Doutula with Python
# encoding: utf-8
# Step 1: import the third-party libraries
import requests             # fetch pages
from lxml import etree      # parse HTML
from urllib import request  # download files
import os                   # path utilities

# Step 2: fetch the target page
def parse_page(url):
    # 2.1 A minimal anti-crawler workaround: send a browser User-Agent
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # 2.2 Get the server response
    response = requests.get(url, headers=HEADERS)
    # 2.3 Pull the HTML out of the response
    text = response.text
    # Return the fetched page
    return text

# Step 3: parse the page and extract the data
def page_list(text):
    # 3.1 Load the page into an HTML object so data can be extracted from it
    html = etree.HTML(text)
    # 3.2 Select every meme <img> node; they come back as a list of objects
    imgs = html.xpath("//div[@class='page-content text-center']//a//img")
    return imgs

def main():
    # Loop over the URL pattern to build the page URLs and hand each one to parse_page()
    for x in range(1, 11):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        # Fetch the page
        text = parse_page(url)
        # Parse the returned page with page_list()
        imgs = page_list(text)
        # 3.3 Walk through the img objects in the list
        for img in imgs:
            # try/except keeps a single bad item from aborting the whole run
            try:
                # 3.4 Collect the image's URL(s) into a list
                imgurl = img.xpath(".//@data-original")
                # 3.5 Iterate rather than index [0]: the list can contain empty values, and indexing would raise
                for img_url in imgurl:
                    # print(img_url)
                    # 3.6 Split off the extension (.jpg / .png) to build the file name
                    suffix = os.path.splitext(img_url)[1]
                    suffix = suffix.split("!")[0]
                    # 3.7 The alt text becomes the image's name
                    alt = img.xpath(".//@alt")[0]
                    # alt = re.sub(r'[,。??,/\\·]', '', alt)  # optionally strip special characters with a regex
                    # 3.8 New file name = alt + suffix
                    img_name = alt + suffix
                    # Step 4: let the machine do the downloading
                    # request.urlretrieve() saves the image into the images/ directory
                    request.urlretrieve(img_url, 'images/' + img_name)
                    # Report which meme has just been downloaded
                    print(img_name + ' downloaded!')
            except:
                print('error while saving this meme')

# Run it
if __name__ == '__main__':
    main()
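Note that request.urlretrieve() sends no User-Agent and raises if the images/ directory does not exist. A hedged drop-in for the download step, reusing requests with the same HEADERS dict that parse_page() builds:

import os
import requests

def download_image(img_url, img_name, headers):
    # Create the target directory on first use
    os.makedirs('images', exist_ok=True)
    resp = requests.get(img_url, headers=headers)
    if resp.status_code == 200:
        with open(os.path.join('images', img_name), 'wb') as f:
            f.write(resp.content)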
Simulating a login to JD.com
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup


class JD_crawl:
    def __init__(self, username, password):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
            'Referer': 'https://www.jd.com/',
        }
        self.login_url = "https://passport.jd.com/new/login.aspx"
        self.post_url = "https://passport.jd.com/uc/loginService"
        self.auth_url = "https://passport.jd.com/uc/showAuthCode"
        self.session = requests.session()
        self.username = username
        self.password = password

    def get_login_info(self):
        html = self.session.get(self.login_url, headers=self.headers).content
        soup = BeautifulSoup(html, 'lxml')
        uuid = soup.select('#uuid')[0].get('value')
        eid = soup.select('#eid')[0].get('value')
        fp = soup.select('input[name="fp"]')[0].get('value')  # session id
        _t = soup.select('input[name="_t"]')[0].get('value')  # token
        login_type = soup.select('input[name="loginType"]')[0].get('value')
        pub_key = soup.select('input[name="pubKey"]')[0].get('value')
        sa_token = soup.select('input[name="sa_token"]')[0].get('value')

        auth_page = self.session.post(self.auth_url,
                                      data={'loginName': self.username, 'nloginpwd': self.password}).text
        if 'true' in auth_page:
            auth_code_url = soup.select('#JD_Verification1')[0].get('src2')
            auth_code = str(self.get_auth_img(auth_code_url))
        else:
            auth_code = ''

        data = {
            'uuid': uuid,
            'eid': eid,
            'fp': fp,
            '_t': _t,
            'loginType': login_type,
            'loginname': self.username,
            'nloginpwd': self.password,
            'chkRememberMe': True,
            'pubKey': pub_key,
            'sa_token': sa_token,
            'authcode': auth_code
        }
        return data

    def get_auth_img(self, url):
        auth_code_url = 'http:{}&yys={}'.format(url, str(int(time.time() * 1000)))
        auth_img = self.session.get(auth_code_url, headers=self.headers)
        with open('authcode.jpg', 'wb') as f:
            f.write(auth_img.content)
        code_typein = input('Enter the captcha shown in the downloaded image: ')
        return code_typein

    def login(self):
        data = self.get_login_info()
        headers = {
            'Referer': self.post_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        try:
            login_page = self.session.post(self.post_url, data=data, headers=headers)
            print(login_page.text)
        except Exception as e:
            print(e)
            # self.session.cookies.clear()

    def shopping(self):
        login = self.session.post('https://cart.jd.com/cart.action', headers=self.headers)
        print(login.text)


if __name__ == '__main__':
    un = input('JD account: ')
    pwd = input('JD password: ')
    jd = JD_crawl(un, pwd)
    jd.login()
    jd.shopping()
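A successful login() leaves the authenticated cookies only in self.session, so they vanish when the process exits. A minimal sketch for saving and restoring them with pickle; the file name jd_cookies.pkl is an arbitrary choice:

import pickle

def save_cookies(session, path='jd_cookies.pkl'):
    # Persist the session's cookie jar to disk
    with open(path, 'wb') as f:
        pickle.dump(session.cookies, f)

def load_cookies(session, path='jd_cookies.pkl'):
    # Merge previously saved cookies back into the session
    with open(path, 'rb') as f:
        session.cookies.update(pickle.load(f))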
Crawling audio files from Ximalaya with Python
import re
import requests
from lxml import etree
from onexima import Xima
def get_id():
    """Fetch the info of every book on the ranking page."""
    main_url = "https://www.ximalaya.com/shangye/top/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    r = requests.get(main_url, headers=headers)
    # Parse the fetched page into an element tree
    html = etree.HTML(r.content.decode())
    # Locate the node holding each book's info
    div_list = html.xpath("//div[contains(@class,'e-2997888007 rrc-album-item')]")
    all_list = []  # each book's audio info goes into this list as a dict
    for div in div_list:
        author = {}  # a dict keeps the book's id and its name paired
        # The href holds the book's id, e.g. /renwen/4859823/
        r = div.xpath("./a/@href")[0]
        print(r)
        # Pull the bare id out with a regex; the id is what requests the correct JSON data later
        author['id'] = re.search(r'\/.*?\/(.*)\/', r).group(1)
        author['book_name'] = div.xpath("./a/div[3]/div[1]/span/text()")[0]
        # Append this book's info to the list
        all_list.append(author)
    print(all_list)
    return all_list

# Collect every book's info; the result is a list of dicts
all_list = get_id()
for i in all_list:
    # Hand each book's id and matching name to the class
    x = Xima(i['id'], i['book_name'])
    x.run()
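The onexima module that provides Xima is not included in this post, so the script is not runnable as-is. The id extraction, though, is self-contained; a quick standalone check of the regex against the href format mentioned in the comment:

import re

href = "/renwen/4859823/"  # format noted in the comment above
book_id = re.search(r'\/.*?\/(.*)\/', href).group(1)
print(book_id)             # prints: 4859823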
Crawling mzitu.com galleries with Python
import requests
from bs4 import BeautifulSoup
import os
import re

Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

def get_page_name(url):
    # Get an album's maximum page count and its title
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    span = soup.findAll('span')
    title = soup.find('h2', class_="main-title")
    return span[10].text, title.text

def get_html(url):
    # Fetch a page's HTML
    req = requests.get(url, headers=Hostreferer)
    html = req.text
    return html

def get_img_url(url, name):
    # Find the image whose alt text matches the album title and return its src
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    img_url = soup.find('img', alt=name)
    return img_url['src']

def save_img(img_url, count, name):
    req = requests.get(img_url, headers=Picreferer)
    new_name = rename(name)
    with open(new_name + '/' + str(count) + '.jpg', 'wb') as f:
        f.write(req.content)

def rename(name):
    # Strip characters that are illegal in file names
    rstr = r'[\/\\\:\*\?\<\>\|]'
    new_name = re.sub(rstr, "", name)
    return new_name

def save_one_atlas(old_url):
    page, name = get_page_name(old_url)
    new_name = rename(name)
    os.mkdir(new_name)
    print("Album " + name + ": saving started")
    for i in range(1, int(page) + 1):
        url = old_url + "/" + str(i)
        img_url = get_img_url(url, name)
        # print(img_url)
        save_img(img_url, i, name)
        print('Saving image ' + str(i))
    print("Album " + name + " saved")

def get_atlas_list(url):
    req = requests.get(url, headers=Hostreferer)
    soup = BeautifulSoup(req.text, 'lxml')
    atlas = soup.find_all(attrs={'class': 'lazy'})
    atlas_list = []
    for atla in atlas:
        atlas_list.append(atla.parent['href'])
    return atlas_list

def save_one_page(start_url):
    atlas_url = get_atlas_list(start_url)
    for url in atlas_url:
        save_one_atlas(url)

if __name__ == '__main__':
    start_url = "http://www.mzitu.com/"
    for count in range(1, 3):
        url = start_url + "page/" + str(count) + "/"
        save_one_page(url)
    print("Crawl finished")
Making a WeChat Moments nine-grid image with Python
from PIL import Image
import sys

# First pad the input image into a square
def fill_image(image):
    width, height = image.size
    # The larger of width and height becomes the new image's side length
    new_image_length = width if width > height else height
    # Create the new image with a white background (note this function!)
    new_image = Image.new(image.mode, (new_image_length, new_image_length), color='white')
    # Paste the original image onto the new one, centered;
    # the (x, y) tuple is the top-left coordinate where the paste starts
    if width > height:
        # Original is wider than tall, so pad the vertical dimension
        new_image.paste(image, (0, int((new_image_length - height) / 2)))
    else:
        new_image.paste(image, (int((new_image_length - width) / 2), 0))
    return new_image

def cut_image(image):
    width, height = image.size
    item_width = int(width / 3)  # Moments shows three images per row
    box_list = []
    # Each box is (left, upper, right, lower)
    for i in range(0, 3):
        for j in range(0, 3):
            # print((i*item_width, j*item_width, (i+1)*item_width, (j+1)*item_width))
            box = (j * item_width, i * item_width, (j + 1) * item_width, (i + 1) * item_width)
            box_list.append(box)
    image_list = [image.crop(box) for box in box_list]
    return image_list

# Save the nine tiles
def save_images(image_list):
    index = 1
    for image in image_list:
        image.save(str(index) + '.png', 'PNG')
        index += 1

if __name__ == '__main__':
    file_path = "4.jpg"
    image = Image.open(file_path)
    # image.show()
    image = fill_image(image)
    image_list = cut_image(image)
    save_images(image_list)
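The script imports sys but never uses it; the file path was presumably meant to come from the command line. A one-line sketch of that intent, keeping 4.jpg as the fallback:

# Read the input path from argv when given, else keep the hard-coded default
file_path = sys.argv[1] if len(sys.argv) > 1 else "4.jpg"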
Crawling the LOL official site with Python
# -*- coding:utf-8 -*-
import requests
import re
import json

# Fetch the JS source to get the champion IDs
# Build the image URLs from those IDs
# Work out each image's download address
# Download the images
# (the function name below uses camelCase)

# Fetch the champion skin images
def getLOLImages():
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'}
    url_js = 'http://lol.qq.com/biz/hero/champion.js'
    # Fetch the JS source; .content returns bytes, not str
    res_js = requests.get(url_js, headers=header).content
    # Decode the bytes into a str
    html_js = res_js.decode()
    # Regex that captures the champion key map out of the JS
    req = '"keys":(.*?),"data"'
    list_js = re.findall(req, html_js)
    # print(list_js[0])
    # str -> dict
    dict_js = json.loads(list_js[0])
    # print(dict_js)
    # Build the list of image URLs
    pic_list = []
    for key in dict_js:
        # print(key)
        # Each champion gets up to 20 skins, numbered 000, 001, ... 019
        for i in range(20):
            num = str(i)
            if len(num) == 1:
                hero_num = "00" + num
            elif len(num) == 2:
                hero_num = "0" + num
            numstr = key + hero_num
            url = "http://ossweb-img.qq.com/images/lol/web201310/skin/big" + numstr + ".jpg"
            # print(url)
            pic_list.append(url)
    # Build the matching list of local file paths
    list_filepath = []
    path = "E:\\文章\\LOL官網\\LOLpic\\"
    # print(dict_js.values())
    for name in dict_js.values():
        for i in range(20):
            file_path = path + name + str(i) + '.jpg'
            list_filepath.append(file_path)
    # print(list_filepath)
    # n keeps the URL list and the file-path list aligned; it is incremented
    # at the end of the loop so index 0 is not skipped
    n = 0
    for picurl in pic_list:
        res = requests.get(picurl, headers=header)
        if res.status_code == 200:
            print("Downloading %s" % list_filepath[n])
            # time.sleep(1)
            with open(list_filepath[n], 'wb') as f:
                f.write(res.content)
        n += 1

getLOLImages()
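Two small cleanups worth noting: the manual zero-padding of the skin number can be done with str.zfill(), and the script assumes the E:\ target directory already exists. A sketch of both:

import os

# str.zfill pads with leading zeros to a fixed width, replacing the if/elif above
for i in range(20):
    hero_num = str(i).zfill(3)   # 0 -> '000', 19 -> '019'

# Creating the output directory up front keeps open() from failing
os.makedirs("E:\\文章\\LOL官網\\LOLpic\\", exist_ok=True)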