1. 程式人生 > python爬蟲小例項

python爬蟲小例項

1、python爬取貼吧桌布

1.1、獲取整個頁面資料

#coding=utf-8
import urllib

def getHtml(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to open; anything the standard ``urlopen`` accepts.

    Returns:
        The bytes (``str`` on Python 2) produced by reading the whole response.

    Raises:
        IOError / URLError: propagated unchanged from ``urlopen`` when the
            resource cannot be opened.
    """
    # ``urllib.urlopen`` exists only on Python 2; Python 3 moved it to
    # ``urllib.request``.  Resolve whichever one is available so the helper
    # works under both interpreters.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    from contextlib import closing

    # ``closing`` guarantees the connection is released even if ``read()``
    # raises; the Python 2 response object is not a context manager itself.
    with closing(urlopen(url)) as page:
        return page.read()

# Fetch the whole thread page and dump its source to stdout.
html = getHtml("http://tieba.baidu.com/p/2738151262")

# Call form of print: a single parenthesised argument prints identically on
# Python 2 and is also valid on Python 3, unlike the bare ``print html``.
print(html)
複製程式碼




1.2、篩選頁面中想要的資料

import re
import urllib

def getHtml(url):
    """Open ``url`` with ``urllib.urlopen`` and return the result of ``read()``."""
    page = urllib.urlopen(url)
    # NOTE(review): ``page`` is never closed in this listing.
    html = page.read()
    return html

def getImg(html):
    """Return every capture of the ``src="....jpg" `` pattern found in ``html``."""
    # Non-greedy capture ending in ".jpg"; the pattern also requires a space
    # after the closing double quote.
    reg = r'src="(.+?\.jpg)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    return imglist

html = getHtml("http://tieba.baidu.com/p/2460150866")
# Python 2 print statement: shows the list returned by getImg.
print getImg(html)

1.3、將頁面篩選的資料儲存到本地

#coding=utf-8
import urllib
import re

def getHtml(url):
    """Open ``url`` with ``urllib.urlopen`` and return the result of ``read()``."""
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    """Save each matched ``.jpg`` URL to ``<n>.jpg``, with ``n`` counting from 0.

    There is no ``return`` statement, so the call evaluates to ``None``.
    """
    reg = r'src="(.+?\.jpg)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0
    for imgurl in imglist:
        # Relative file name, so the file lands in the current working directory.
        urllib.urlretrieve(imgurl,'%s.jpg' % x)
        x+=1

html = getHtml("http://tieba.baidu.com/p/2460150866")
# getImg has no return value here, so this prints None after the downloads.
print getImg(html)

抓取暱圖網圖片 --修改版

#coding=utf-8
import urllib
import re

def getHtml(url):
    """Open ``url`` with ``urllib.urlopen`` and return the result of ``read()``."""
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    """Save every matched ``src`` value to a numbered ``.jpg`` under ``D:360\\``.

    There is no ``return`` statement, so the call evaluates to ``None``.
    """
    # Unlike the tieba listings, this pattern captures any src value, not only
    # ones ending in ".jpg".
    reg = r'src="(.*?)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0
    for imgurl in imglist:
        # NOTE(review): 'D:360\\' has no separator after the drive letter;
        # possibly 'D:\\360\\' was intended -- confirm.
        urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)
        x+=1

html = getHtml("http://www.nipic.com/show/17742538.html")
print getImg(html)

解釋:
%s意思是字串引數,就是將變數的值傳入到字串裡面,字串後的'%'後就是寫要傳入的引數。
在你給出的例子中,就是用x的值替代%s。比如說x=5,那麼就是爬取url後面是'5.jpg'這個圖片
儲存的位置預設為程式的存放目錄
如何儲存到指定目錄:urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)
https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word

2、python抓取價格

前兩個不用加 text

#-*—coding:utf8-*-
from lxml import etree
import urllib
import urllib.request
# headers: build a dict that holds the User-Agent
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
html = urllib.request.urlopen(url).read()
# Decode the response bytes as UTF-8 before handing them to lxml.
data=html.decode('utf-8')
selector = etree.HTML(data)
#xpath
# Select the text nodes at the div/ul/li/div/div/strong/i path.
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

或者

#-*—coding:utf8-*-
from lxml import etree
import urllib
import urllib.request
# headers: build a dict that holds the User-Agent
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
html = urllib.request.urlopen(url).read()
# This variant passes the result of read() to etree.HTML without decoding.
selector = etree.HTML(html)
#xpath
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

或者 :注意:這個需要加text html.text

#-*—coding:utf8-*-
from lxml import etree
import requests
#headers構造一個字典,裡面儲存了user-agent #headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b" html = requests.get(url) selector = etree.HTML(html.text) #xpath qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()') #print(qiubai_text) for i in qiubai_text: print(i) 3、python爬取暱圖網圖片 #coding=utf-8 import urllib import re def getHtml(url): page = urllib.urlopen(url) html = page.read() return html def getImg(html): reg = r'src="(.*?)" ' imgre = re.compile(reg) imglist = re.findall(imgre,html) x = 0 for imgurl in imglist: urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x) x+=1 html = getHtml("http://www.nipic.com/show/17742538.html") print getImg(html) 4、爬音樂 # coding:utf-8 import urllib import urllib.request import re url="http://www.yy8844.cn/ting/ccceo/ceeivi.shtml" html = urllib.request.urlopen(url).read() data=html.decode('GBK') #print(data) music_id = int(re.findall(r'MusicId=(\d+)',data)[0]) music_name = re.findall(r'<title>(.*?)</title>',data)[0].split('/')[0].strip() music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0] article='word' with open("%s.txt" % article,'w') as f: f.write(music_word) #print(music_word) quanurl="http://96.ierge.cn/"'%d/%d/%s' % (music_id//30000,music_id//2000,music_id)+".mp3" #print(quanurl) bata=urllib.request.urlopen(quanurl).read() with open("%s.mp3" % music_name,'wb') as f: f.write(bata) 注意問題: music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0] python中AttributeError解決 【Python 指令碼報錯】AttributeError:'module' has no attribute 'xxx'的解決方法 http://blog.csdn.net/cn_wk/article/details/50839159 int庫的.pyc檔案 python 去掉 .pyc http://blog.csdn.net/ubuntu64fan/article/details/48241985 python操作物件屬性 http://www.jianshu.com/p/c38a81b8cb38 
Python學習日記4|python爬蟲常見報錯小結及解決方法
http://www.jianshu.com/p/17c921639ad0

#coding=utf-8
from Tkinter import *
import tkMessageBox
import urllib
import json
import mp3play
import time
import threading
from pinyin import PinYin
import os
import stat

# NOTE(review): PinYin comes from a local/third-party ``pinyin`` module not
# shown here; load_word() presumably loads its lookup table -- confirm.
test = PinYin()
test.load_word()
stop=0

def music():
    """Search with the entry box text and fill the listbox with the results.

    Rebinds the module-level list ``x`` to the ``audio`` value of each result,
    in the same order the rows are inserted into the listbox.
    """
    if not entry.get():
        tkMessageBox.showinfo("溫馨提示","搜尋內容不能為空")
        return
    name = test.hanzi2pinyin_split(entry.get())
    html=urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9"%name).read()
    js=json.loads(html)
    n = 0
    global x
    x = []
    for i in js['result']['songs']:
        # Row text is "<song name>(<first artist name>)".
        listbox.insert(n,'%s(%s)'%(i['name'],i['artists'][0]['name']))
        n+=1
        x.append(i['audio'])

count = 0
#isplaying = None

def play():
    """Download the selected listbox entry to ``tmp<count>.mp3`` and play it."""
    global count
    # A new counter value per call gives each download its own temp file name.
    count += 1
    index=listbox.curselection()
    var1.set(u"正在載入"+listbox.get(index,last=None))
    urllib.urlretrieve(x[index[0]],'tmp%s.mp3'%str(count))
    var1.set(u"正在播放"+listbox.get(index,last=None))
    mp3=mp3play.load("tmp%s.mp3"%str(count))
    mp3.play()
    # Sleep for the duration reported by the clip.
    # NOTE(review): presumably this keeps the clip object alive while it
    # plays -- confirm against mp3play.
    time.sleep(mp3.seconds())

import inspect
import ctypes

def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    tid = ctypes.c_long(tid)
    # Accept an exception instance as well as a class.
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # Any result other than 0 or 1: clear the pending exception by calling
        # again with None, then report failure.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    """Asynchronously raise ``SystemExit`` in ``thread`` via ``_async_raise``."""
    _async_raise(thread.ident, SystemExit)

threads=list()
t=None

def excute(event):
    """Double-click handler: stop every recorded thread, then start ``play``."""
    global t
    # Every thread ever appended is signalled again; the list is never pruned.
    for i in threads:
        stop_thread(i)
    t = threading.Thread(target=play)
    t.setDaemon(True)
    t.start()
    threads.append(t)

root = Tk()  # create a window
root.title("雲音樂")
root.geometry("500x300+500+200")
entry=Entry(root)  # create a single-line entry box parented to root
entry.pack()
btn=Button(root,text="搜 索",command=music)
btn.pack()  # all widgets must use the same layout manager
var=StringVar()
listbox=Listbox(root,width=50,listvariable=var)
listbox.bind('<Double-Button-1>',excute)
listbox.pack()
var1=StringVar()
label=Label(root,text="雲音樂播放器",fg="purple",textvariable=var1)
var1.set("雲音樂播放器")
label.pack()
root.mainloop()  # show the window