python爬蟲小例項

阿新 • • 發佈：2018-12-01

1、python爬取貼吧桌布

1.1、獲取整個頁面資料

#coding=utf-8
import urllib

def getHtml(url):
    """Download *url* and return the response body.

    :param url: address of the page to fetch.
    :returns: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

html = getHtml("http://tieba.baidu.com/p/2738151262")

# Parenthesised so the line prints the same on Python 2 and is valid Python 3 syntax.
print(html)
複製程式碼




1.2、篩選頁面中想要的資料

import re
import urllib

def getHtml(url):
    """Download *url* and return the response body.

    :param url: address of the page to fetch.
    :returns: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

def getImg(html):
    """Collect the image addresses referenced in *html*.

    A match is the value of a ``src="..."`` attribute that ends in ``.jpg``
    and whose closing quote is followed by a space.

    :param html: page source to scan.
    :returns: list of matched addresses, in document order.
    """
    jpg_src = re.compile(r'src="(.+?\.jpg)" ')
    return jpg_src.findall(html)
       
html = getHtml("http://tieba.baidu.com/p/2460150866")
# Parenthesised so the line prints the same on Python 2 and is valid Python 3 syntax.
print(getImg(html))



1.3、將頁面篩選的資料儲存到本地


#coding=utf-8
import urllib
import re

def getHtml(url):
    """Download *url* and return the response body.

    :param url: address of the page to fetch.
    :returns: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    Each matched address is saved next to the running program as
    ``0.jpg``, ``1.jpg``, ... in the order the matches appear.

    :param html: page source to scan.
    :returns: None; the function only writes files.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" ')
    # enumerate() supplies the running file number instead of a manual counter.
    for x, imgurl in enumerate(imgre.findall(html)):
        urllib.urlretrieve(imgurl, '%s.jpg' % x)


html = getHtml("http://tieba.baidu.com/p/2460150866")

# getImg() only saves files and returns nothing, so printing its result
# would just output "None"; call it directly instead.
getImg(html)


抓取暱圖網圖片 --修改版

#coding=utf-8
import urllib
import re

def getHtml(url):
    """Download *url* and return the response body.

    :param url: address of the page to fetch.
    :returns: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

def getImg(html):
    """Download every ``src="..."`` target found in *html*.

    Any attribute value whose closing quote is followed by a space is
    treated as a download address; the n-th match (counting from 0) is
    saved under the ``D:360\\`` prefix as ``n.jpg``.

    :param html: page source to scan.
    :returns: None; the function only writes files.
    """
    src_pattern = re.compile(r'src="(.*?)" ')
    for index, link in enumerate(src_pattern.findall(html)):
        urllib.urlretrieve(link, 'D:360\\%s.jpg' % index)


html = getHtml("http://www.nipic.com/show/17742538.html")

# getImg() only saves files and returns nothing, so printing its result
# would just output "None"; call it directly instead.
getImg(html)



解釋：


%s 是字串格式化的佔位符：字串後面的 % 運算子會把右邊變數的值填入 %s 的位置。
在上面的例子中，就是用 x 的值替代 %s。比如說 x=5，下載的圖片就會儲存成 '5.jpg'（'%s.jpg' 是本地儲存的檔名，不是圖片的 url）。


儲存的位置預設為執行程式時的目前工作目錄（通常就是程式的存放目錄）


如何儲存到指定目錄：urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)


https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word



2、python抓取價格

前兩個例子（urllib.request）不用加 .text；第三個例子（requests）需要加 .text，也就是 html.text


# -*- coding: utf-8 -*-
from lxml import etree

import urllib
import urllib.request
# Optional request headers: a dict carrying the User-Agent. It is left disabled,
# so the request below is sent without any custom headers.
# NOTE: the value repeats the "User-Agent:" prefix; drop it if you enable the dict.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
# Fetch the search page and decode the body as UTF-8 before parsing it.
page_source = urllib.request.urlopen(url).read().decode('utf-8')
document = etree.HTML(page_source)
# XPath: the text of every <i> nested under div/ul/li/div/div/strong.
price_texts = document.xpath('//div/ul/li/div/div/strong/i/text()')
#print(price_texts)
for price in price_texts:
    print(price)


或者


# -*- coding: utf-8 -*-
from lxml import etree

import urllib
import urllib.request
# Optional request headers: a dict carrying the User-Agent. It is left disabled,
# so the request below is sent without any custom headers.
# NOTE: the value repeats the "User-Agent:" prefix; drop it if you enable the dict.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
# Fetch the search page; the body is handed to the parser without decoding.
raw_page = urllib.request.urlopen(url).read()
document = etree.HTML(raw_page)
# XPath: the text of every <i> nested under div/ul/li/div/div/strong.
price_texts = document.xpath('//div/ul/li/div/div/strong/i/text()')
#print(price_texts)
for price in price_texts:
    print(price)



或者（注意：這個寫法需要加 .text，也就是傳入 html.text）：


# -*- coding: utf-8 -*-
from lxml import etree
import requests
# Optional request headers: a dict carrying the User-Agent. It is left disabled,
# so the request below is sent without any custom headers.
# NOTE: the value repeats the "User-Agent:" prefix; drop it if you enable the dict.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
response = requests.get(url)
# The parser is given the response's .text attribute, not the response object itself.
document = etree.HTML(response.text)
# XPath: the text of every <i> nested under div/ul/li/div/div/strong.
price_texts = document.xpath('//div/ul/li/div/div/strong/i/text()')
#print(price_texts)
for price in price_texts:
    print(price)


3、python爬取暱圖網圖片

#coding=utf-8
import urllib
import re

def getHtml(url):
    """Download *url* and return the response body.

    :param url: address of the page to fetch.
    :returns: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

def getImg(html):
    """Download every ``src="..."`` target found in *html*.

    Any attribute value whose closing quote is followed by a space is
    treated as a download address; the n-th match (counting from 0) is
    saved under the ``D:360\\`` prefix as ``n.jpg``.

    :param html: page source to scan.
    :returns: None; the function only writes files.
    """
    src_pattern = re.compile(r'src="(.*?)" ')
    for index, link in enumerate(src_pattern.findall(html)):
        urllib.urlretrieve(link, 'D:360\\%s.jpg' % index)


html = getHtml("http://www.nipic.com/show/17742538.html")

# getImg() only saves files and returns nothing, so printing its result
# would just output "None"; call it directly instead.
getImg(html)


4、爬音樂


# coding:utf-8
import urllib
import urllib.request
import re
url="http://www.yy8844.cn/ting/ccceo/ceeivi.shtml"
# Close the HTTP response as soon as the page has been read.
with urllib.request.urlopen(url) as response:
    html = response.read()
# The page body is decoded as GBK before the regexes run on it.
data=html.decode('GBK')
#print(data)
# Numeric id of the track, taken from the first "MusicId=<digits>" in the page.
music_id = int(re.findall(r'MusicId=(\d+)',data)[0])
# Song name: the part of <title> before the first '/', stripped of whitespace.
music_name = re.findall(r'<title>(.*?)</title>',data)[0].split('/')[0].strip()
# re.S lets '.' match newlines so the whole lyrics <div> is captured.
music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]
article='word'
# An explicit encoding keeps the write from depending on the platform default,
# which may be unable to represent every character of the lyrics.
with open("%s.txt" % article,'w',encoding='utf-8') as f:
    f.write(music_word)
#print(music_word)
# The mp3 address is built from the track id: <id // 30000>/<id // 2000>/<id>.mp3
quanurl="http://96.ierge.cn/%d/%d/%s.mp3" % (music_id//30000,music_id//2000,music_id)
#print(quanurl)
with urllib.request.urlopen(quanurl) as response:
    bata = response.read()
# Binary mode: the mp3 bytes are written unchanged.
with open("%s.mp3" % music_name,'wb') as f:
    f.write(bata)


注意問題：歌詞的 <div> 內容通常跨越多行，findall 必須加上 re.S，讓「.」也能匹配換行符，否則會匹配不到歌詞：

music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]


python中AttributeError解決

【Python 指令碼報錯】AttributeError:'module' has no attribute 'xxx'的解決方法
http://blog.csdn.net/cn_wk/article/details/50839159


int庫的.pyc檔案


python 去掉 .pyc
http://blog.csdn.net/ubuntu64fan/article/details/48241985


python操作物件屬性
http://www.jianshu.com/p/c38a81b8cb38


Python學習日記4|python爬蟲常見報錯小結及解決方法

http://www.jianshu.com/p/17c921639ad0





#coding=utf-8
from Tkinter import *
import  tkMessageBox
import urllib
import json
import mp3play
import time
import threading
from pinyin import PinYin
import os
import stat
# Pinyin converter; music() uses it to convert the search text before querying.
test = PinYin()
test.load_word()
# NOTE(review): `stop` is not referenced anywhere in the visible code.
stop=0
def music():
    """Search for the text in the entry box and list the results.

    Shows a message box and returns when the entry is empty. Otherwise the
    text is converted with ``test.hanzi2pinyin_split``, the search URL is
    queried, and every returned song is added to ``listbox`` as
    ``name(artist)``. The matching audio addresses are stored in the global
    list ``x`` at the same positions, which is what ``play`` indexes into.
    """
    global x
    keyword = entry.get()
    if not keyword:
        tkMessageBox.showinfo("溫馨提示","搜尋內容不能為空")
        return
    name = test.hanzi2pinyin_split(keyword)
    html=urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9"%name).read()
    js=json.loads(html)
    x = []
    # Remove the rows of the previous search: x is rebuilt from scratch, so
    # leftover rows would no longer line up with the indexes of x.
    listbox.delete(0, 'end')
    for i in js['result']['songs']:
        listbox.insert('end','%s(%s)'%(i['name'],i['artists'][0]['name']))
        x.append(i['audio'])
# Number of downloads so far; gives every temporary mp3 file a distinct name.
count = 0
#isplaying = None
def play():
    """Download and play the song currently selected in the listbox.

    The audio address is looked up in the global list ``x`` by the selected
    row, saved as ``tmp<count>.mp3`` and played with ``mp3play``. The status
    label (``var1``) is updated before the download and before playback.
    Returns without doing anything when no row is selected.
    """
    global count
    index=listbox.curselection()
    if not index:
        # Nothing selected: there is no entry of x to look up.
        return
    count += 1
    title = listbox.get(index,last=None)
    filename = 'tmp%s.mp3'%str(count)
    var1.set(u"正在載入"+title)
    urllib.urlretrieve(x[index[0]],filename)
    var1.set(u"正在播放"+title)
    mp3=mp3play.load(filename)
    mp3.play()
    # presumably keeps this thread (and the mp3 object) alive for the clip's
    # duration; confirm against mp3play's behaviour
    time.sleep(mp3.seconds())

import inspect
import ctypes

def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    """Request that *thread* exit by raising SystemExit inside it."""
    target_id = thread.ident
    _async_raise(target_id, SystemExit)
# Playback threads that may still be running.
threads=list()
# Most recently started playback thread.
t=None
def excute(event):
    """Listbox double-click handler: stop current playback and start anew.

    Every playback thread that is still alive is asked to stop, the list is
    emptied, and a new daemon thread running ``play`` is started and recorded.

    :param event: the Tk event object (unused).
    """
    global  t
    for worker in threads:
        # A finished thread can no longer be signalled; stop_thread would
        # raise ValueError for it and abort this handler before the new
        # song starts.
        if not worker.is_alive():
            continue
        try:
            stop_thread(worker)
        except ValueError:
            # The thread ended between the liveness check and the stop request.
            pass
    # Each thread has been dealt with; forget them so the list does not grow.
    del threads[:]
    t = threading.Thread(target=play)
    t.daemon = True
    t.start()
    threads.append(t)
# Build the player window: search entry, search button, result list, status label.
root = Tk()  # create the main window
root.title("雲音樂")
root.geometry("500x300+500+200")
entry=Entry(root)  # single-line input box, parented to root
entry.pack()
# The search button runs music() when clicked.
btn=Button(root,text="搜 索",command=music)
btn.pack()  # every widget must use the same layout method
var=StringVar()
listbox=Listbox(root,width=50,listvariable=var)
# Double-clicking a row calls excute().
listbox.bind('<Double-Button-1>',excute)
listbox.pack()
# var1 backs the label text; play() updates it while loading and playing.
var1=StringVar()
label=Label(root,text="雲音樂播放器",fg="purple",textvariable=var1)
var1.set("雲音樂播放器")
label.pack()
root.mainloop()  # enter the Tk event loop (shows the window)

python爬蟲小例項

1、python爬取貼吧桌布 1.1、獲取整個頁面資料 #coding=utf-8 import urllib def getHtml(url): page = urllib.urlopen(url) html = page.read() return html html

Python爬蟲小白---（二）爬蟲基礎--Selenium PhantomJS

decode bject windows beautiful 結構由於 target header 速度一、前言　　前段時間嘗試爬取了網易雲音樂的歌曲，這次打算爬取QQ音樂的歌曲信息。網易雲音樂歌曲列表是通過iframe展示的，可以借助Selenium獲

Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

python tor enc mini 執行 gem view 獲取但是前兩天有人私信我，讓我爬這個網站，http://bbs.baobeihuijia.com/forum-191-1.html上的失蹤兒童信息，準備根據失蹤兒童的失蹤時的地理位置來更好的尋找失蹤兒童，這

python爬蟲實例項目大全

agent 相冊 dont 公眾號讀取知識庫 server 微博烏雲 WechatSogou [1]- 微信公眾號爬蟲。基於搜狗微信搜索的微信公眾號爬蟲接口，可以擴展成基於搜狗搜索的爬蟲，返回結果是列表，每一項均是公眾號具體信息字典。 DouBanSpider [2

Python 入門小例項筆記

例項1：列印使用者輸入的姓名與手機號碼知識點：編碼，獲取輸入，變數，標準輸出 1 #encoding=utf-8 2 3 import time 4 5 #1.提示使用者輸入資訊 6 7 name = input ("請輸入您的姓名:")

爬蟲小例項

1.網頁資訊爬取 import requests try: kv={'user-agent': 'Mozilla/5.0'} url='http://www.baidu.com/' r=requests.get(url,headers=kv) r.rais

Python爬蟲新聞例項程式碼

"新聞的爬取到本地的" # 思路：先爬取首頁然後在通過正則表示式獲取所有的新聞連結然後在爬出各類的新聞並存儲本地 #http://news.sina.com.cn/ html="http://news.sina.com.cn/" data=urllib.request.urlopen(ht

學習的一點爬蟲小例項

def function(): pat="[a-zA-Z]+://[^\s]*[.com|.cn]" string='<a herf="http://www.baidu.com>haafdsg</a>' res=re.compi

Python爬蟲小白——（二）爬蟲基礎——Selenium PhantomJS

前段時間嘗試爬取了網易雲音樂的歌曲，這次打算爬取QQ音樂的歌曲資訊。網易雲音樂歌曲列表是通過iframe展示的，可以藉助Selenium獲取到iframe的頁面元素，而QQ音樂採用的是非同步載入的方式，套路不一樣，這是主流的頁面載入方式，爬取有點難度，不過也是對自己的一個挑戰。二、Pyt

Python爬蟲小案例

''' 模組註釋 ''' from urllib import request import re class Spider(): ''' 類的註釋，註釋寫在類下面 '

Python爬蟲小白入門（二）requests庫

轉自：https://www.cnblogs.com/Albert-Lee/p/6230337.html 一、前言為什麼要先說Requests庫呢，因為這是個功能很強大的網路請求庫，可以實現跟瀏覽器一樣傳送各種HTTP請求來獲取網站的資料。網路上的模組、庫、包指的都

Python爬蟲小白入門（一）寫在前面

轉自：https://www.cnblogs.com/Albert-Lee/p/6226699.html 一、前言你是不是在為想收集資料而不知道如何收集而著急？你是不是在為想學習爬蟲而找不到一個專門為小白寫的教程而煩惱？ Bingo! 你沒有看錯，這就

適合新手的Python爬蟲小程式

介紹：此程式是使用python做的一個爬蟲小程式爬取了python百度百科中的部分內容，因為這個demo是根據網站中的靜態結構爬取的，所以如果百度百科詞條的html結構發生變化需要修改部分內容。詞條連結 http://baike.baidu.com/item/

python爬蟲小試例項--爬取網頁圖片並下載

一、python安裝在python的官網下載python版本，需要下載對應版本（在計算機-屬性中檢視自己是32位作業系統還是64位作業系統），我是64位的，就下載64位對應的安裝包了（如下圖：Windows x86-64 executable installer）。官網下載地

Python爬蟲小案例：豆瓣電影TOP250

原始碼： #!/usr/bin/python3 # -*-coding: UTF-8-*- from urllib import request import re class MovieTop250(object): def __init

Python爬蟲小白學習心得（一

四、BeautifulSoup中使用Find和Find_all方法提示想要的內容。如例項物件soup.find_all("div",class_="xxx")#注意class屬性在這裡有個下劃線，要獲取某屬性的值如get('href')的用法。另外還需要了解soup.select的css選擇器方法，最主要

【python爬蟲小實戰】python3.x用requests和bs4實現有道翻譯(中英文)

一直用的是python3.x版本的，剛開始學爬蟲的時候學長給了我個爬有道翻譯的小程式，實現中英文翻譯，由於是用urllib庫的，當時也是剛接觸python，所以一臉懵逼，現在學了一個月了，回頭再看了一下，感覺很時間單，於是就用requests庫和bs4，加上js

python爬蟲【例項】爬取豆瓣電影評分連結並圖示（）-問題如何爬取電影圖片（解決有程式碼）

這裡只有尾巴，來分析一下確定範圍：如何爬取圖片並下載？參考：http://blog.csdn.net/chaoren666/article/details/53488083----------------------------------------------------

[python爬蟲小實戰2]根據使用者輸入關鍵詞爬取今日頭條圖集，並批量下載圖片

這算是比較貼近於實際生活的爬蟲了，根據使用者輸入的關鍵字批量下載今日頭條相關圖集圖片，，核心用到了urllib.request.urlretrieve()這個方法，然後百度了一下進度條怎麼玩，直接把程式碼加上去了，沒毛病，感覺程式碼有些複雜，其實理論上一層網頁可

Python爬蟲小偏方：突破登錄和訪問頻率限制，多研究對方不同終端產品

strong 抓取微信授權分享很多資源賬號仿微信詳細介紹其實在抓取數據時，如果有大量的離散賬號和離散IP的話，抓取數據就問題不大了。但是老猿相信大部分的爬蟲選手們都沒有這麽多的資源，所以就會絞盡腦汁研究和各種嘗試對方的訪問控制策略，如果始終無法破局，這時就要

python爬蟲小例項

相關推薦