Python爬蟲的一些操作
阿新 • 發佈:2018-10-17
add 一次 設置 app new 下載圖片 afa 練手 json
1.先來個不反爬的
"""Crawl the Autohome news page and download each article's cover image.

This site sets no anti-scraping measures, which makes it the best practice
target. (Original note: 這個不設置反爬措施,練手最好用)
"""
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.autohome.com.cn/news/")
# The site serves GBK-encoded pages; decode accordingly before parsing.
response.encoding = 'gbk'
# Wrap the HTML in a soup object.
soup = BeautifulSoup(response.text, 'html.parser')
# Locate the first matching container div holding the article list.
# (The blog-extraction artifact "View Code" that was fused into this id
# string has been removed.)
div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
# All list items under that div.
li_list = div.find_all(name='li')
# Iterate and pull out the data for each article card.
for li in li_list:
    title = li.find(name='h3')
    if not title:
        # Skip <li> entries that are not article cards (no headline).
        continue
    p = li.find(name='p')
    a = li.find(name='a')
    print(title.text)
    print(a.attrs.get('href'))
    print(p.text)
    img = li.find(name='img')
    src = img.get('src')
    # Image URLs are protocol-relative ("//..."); prepend the scheme.
    src = "https:" + src
    print(type(src))
    print(type(title.text))
    # Issue a second request to download the image to the local directory,
    # named after the last path segment of its URL.
    file_name = src.rsplit('/', maxsplit=1)[1]
    ret = requests.get(src)
    with open(file_name, 'wb') as f:
        f.write(ret.content)
2.來個獲取數據的
"""Crawl the jandan.net front page and print the text of each post.

(Original title: 進階爬蟲1 — "intermediate crawler 1". The extraction
artifact "View Code" that was fused between the import statements has
been removed.)
"""
import requests
from bs4 import BeautifulSoup

res = requests.get(
    url="http://jandan.net/",
)
soup = BeautifulSoup(res.text, "html.parser")
# The main content column.
div = soup.find(name="div", attrs={"id": "content"})
# Every post card inside it.
div_list = div.find_all(name="div", attrs={"class": "post f list-post"})
for div in div_list:
    print(div.text.strip())  # dump the full text of the post
    # Alternative: grab just the image of each post.
    # img = div.find(name="img")
    # src = img.get("src")
    # if not src:
    #     continue
    # src = "https:" + src
    # print(src)
    # Alternative: grab just the title of each post.
    # h = div.find(name="h2")
    # a = h.find(name="a")
    # print(a.text)
3.來個有點難度的
"""Log in to dig.chouti.com and upvote a link.

(Original title: 爬蟲進階2 — "crawler advanced 2".)

Flow: 1) GET the home page to obtain the session cookies; 2) POST the
credentials, carrying those cookies (the site binds the login to the
cookies of the first visit); 3) POST the vote, again with the first
visit's cookies.
"""
import requests

# One shared browser-like User-Agent header; the site rejects the default
# python-requests UA.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# 1. Visit the home page (acquires the anonymous session cookies).
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers=HEADERS,
)

# 2. Submit the phone number and password, bound to r1's cookies.
r2 = requests.post(
    url='https://dig.chouti.com/login',
    headers=HEADERS,
    data={
        'phone': '86你的賬號',       # "86" country prefix + your account
        'password': '你的密碼',      # your password
        'oneMonth': 1,               # "remember me for one month"
    },
    cookies=r1.cookies.get_dict(),
)

# 3. Upvote a specific link; authorization rides on r1's (now logged-in)
# cookies, not r2's.
r3 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=20435396',
    headers=HEADERS,
    cookies=r1.cookies.get_dict(),
)
print(r3.text)
4.來個再難一點的
"""Log in to lagou.com (which uses dynamic anti-CSRF tokens) and list
Python job postings.

(Original title: 進階爬取3 — "advanced crawl 3".)

The login page embeds two one-time values, X_Anti_Forge_Token and
X_Anti_Forge_Code, that must be echoed back as request headers — this is
the site's defence against forged / replayed login posts (the "small
trap" the original author notes).
"""
import requests
import re
from bs4 import BeautifulSoup

UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"

# First load the login page itself: it carries both the anti-forgery
# values and the cookies the login POST must present.
res = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={"User-Agent": UA},
)

# Extract the two tokens from the inline script with a regex.
# NOTE: the patterns must use straight ASCII quotes — the page source
# contains 'X_Anti_Forge_Token = ...'; the blog's curly quotes would
# never match.
X_Anti_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]

ret = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": UA,
        # NOTE(review): lagou's own header names misspell "Anti" as
        # "Anit"; the original post mixed a hyphenated and an
        # underscored spelling — HTTP custom headers are hyphenated,
        # so both are sent hyphenated here. Confirm against the site's
        # current JS.
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X-Anit-Forge-Code": X_Anti_Forge_Code,
        # Referer must be the page the form was served from (small trap).
        "Referer": "https://passport.lagou.com/login/login.html",
    },
    data={
        "isValidate": True,
        "username": "你的賬號",   # your account (was an unquoted bare name in the post)
        "password": "你的密碼",   # your password
        "request_form_verifyCode": "",
        "submit": "",
        "challenge": "c87407cd89add055d8f1b54ad579cec8",
    },
    # Carry the login page's cookies to be accepted (small trap).
    cookies=res.cookies.get_dict(),
)

# Now fetch the Python job listing page with the logged-in cookies.
r1 = requests.get(
    url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
    headers={
        "User-Agent": UA,
        # Previous page in the navigation chain; could be re-matched
        # dynamically if the site starts checking it strictly.
        "Referer": "https://www.lagou.com/",
    },
    cookies=ret.cookies.get_dict(),
)

soup = BeautifulSoup(r1.text, "html.parser")
div = soup.find(name="div", attrs={"id": "s_position_list"})
li_list = div.find_all(name="li")
for li in li_list:
    title = li.find(name="h3")
    if not title:
        # Not a job card; skip.
        continue
    money = li.find(name="span")
    div = li.find(name="div", attrs={"class": "li_b_l"})
    a = li.find(name="a")
    print(title.text)
    print(money.text)
    print(div.text)
    print(a.text)
5.來個github的
"""Log in to GitHub with a CSRF token scraped from the session page.

(Original title: 進階爬取4 — "advanced crawl 4".)

Trap noted by the author: the page that hands out the cookies and the
authenticity_token is /session, while the credentials are POSTed to
/login.
"""
import requests
from bs4 import BeautifulSoup

UA = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# 1. GET the session page: yields the anonymous cookies and the hidden
#    CSRF input named "authenticity_token".
r1 = requests.get(
    url="https://github.com/session",
    headers={'User-Agent': UA},
)
soup = BeautifulSoup(r1.text, "html.parser")
inp = soup.find(name="input", attrs={"name": "authenticity_token"})
cookies = r1.cookies.get_dict()
token = inp.get("value")

# 2. POST the login form, echoing back the token and cookies.
r2 = requests.post(
    url="https://github.com/login",
    headers={'User-Agent': UA},
    data={
        "commit": "Sign in",
        # GitHub's form posts the literal checkmark here; the blog's "?"
        # was a mojibake of it. TODO confirm against the live form.
        "utf8": "✓",
        "authenticity_token": token,
        "login": "你的賬號",      # your account
        "password": "你的密碼",   # your password
    },
    cookies=cookies,
)
# From here on, scrape whatever you need with the authenticated session.
print(r2.text)
Python爬蟲的一些操作