1. 程式人生 > >Python程式設計入門學習筆記(七)

Python程式設計入門學習筆記(七)

簡單爬蟲

python庫
    1、requests 用來獲取頁面內容
    2、BeautifulSoup 

    文件連結:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

爬取鏈家網的資訊

    

安裝第三方庫

 pip install requests
 pip install beautifulsoup4

新建資料庫:

CREATE DATABASE /*!32312 IF NOT EXISTS*/`house` /*!40100 DEFAULT CHARACTER SET utf8 */;

USE `house`;

/* Table structure for table `db_house`.
   One row per scraped rental listing; all scraped fields are stored as text.
   NOTE(review): consider utf8mb4 if listings may contain non-BMP characters. */

DROP TABLE IF EXISTS `db_house`;

CREATE TABLE `db_house` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `price` varchar(80) DEFAULT NULL,
  `unit` varchar(80) DEFAULT NULL,
  `area` varchar(80) DEFAULT NULL,
  `layout` varchar(80) DEFAULT NULL,
  `floor` varchar(80) DEFAULT NULL,
  `direction` varchar(80) DEFAULT NULL,
  `subway` varchar(80) DEFAULT NULL,
  `community` varchar(80) DEFAULT NULL,
  `location` varchar(80) DEFAULT NULL,
  `agent_name` varchar(80) DEFAULT NULL,
  `agent_id` varchar(80) DEFAULT NULL,
  -- The PRIMARY KEY already indexes `id`; the original extra KEY `id` (`id`)
  -- was a redundant duplicate index and has been removed.
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

爬蟲程式如下:

import time
import pymysql
import requests
from bs4 import BeautifulSoup
# 獲取url下的頁面內容,返回soup物件
def get_page(url, timeout=10):
    """Fetch *url* and return the parsed page as a BeautifulSoup object.

    Args:
        url: page URL to download.
        timeout: seconds to wait for the HTTP response (new, defaulted
            parameter; the original call had no timeout and could hang
            forever on a stalled connection).

    Returns:
        BeautifulSoup: the response body parsed with the ``html5lib`` parser.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html5lib')
    return soup

# 將以上的程式碼封裝成一個獲取連結的方法函式,作用是獲取列表頁下面的所有租房頁面的連結,返回連結列表

def get_links(link_url):
    """Collect every rental-detail URL found on a listing page.

    Args:
        link_url: URL of a listing (index) page.

    Returns:
        list[str]: the ``href`` of the anchor inside each picture panel.
    """
    soup = get_page(link_url)
    hrefs = []
    for panel in soup.find_all('div', class_="pic-panel"):
        hrefs.append(panel.a.get('href'))
    return hrefs

def get_house_info(house_url):
    """Scrape one rental detail page and return its fields as a dict.

    The extraction relies on the page's fixed layout: the n-th ``<p>`` tag
    always holds the same field, and each field starts with a fixed-length
    Chinese label that is sliced off.

    Args:
        house_url: URL of a single rental detail page.

    Returns:
        dict: scraped fields keyed by their Chinese names.
    """
    soup = get_page(house_url)
    details = soup.find_all('p')
    # Slice off the leading label text (e.g. "面積:") of each paragraph.
    area = details[0].text[3:]
    layout = details[1].text[5:]
    floor = details[2].text[3:]
    direction = details[3].text[5:]
    subway = details[4].text[3:]
    community = details[5].text[3:]
    location = details[6].text[3:]
    create_time = details[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    evaluate = soup.find('div', class_='evaluate')
    # Parsed from the rating widget but not returned (kept as in the
    # original page walk).
    score, number = evaluate.find('span', class_='rate').text.split('/')
    times = evaluate.find('span', class_='time').text[5:-1]
    return {
        '價格': soup.find('span', class_='total').text,
        '單位': soup.find('span', class_='unit').text.strip(),
        '面積': area,
        '戶型': layout,
        '樓層': floor,
        '朝向': direction,
        '釋出時間': create_time,
        '地鐵': subway,
        '小區': community,
        '位置': location,
        '經紀人姓名': agent.text,
        '經紀人ID': agent.get('data-el'),
    }

# Connection settings for the MySQL server holding the `house` schema.
DATABASE = dict(
    # For a remote database, put the remote server's IP address here.
    host='localhost',
    database='house',
    user='root',
    password='toor',
    # Character-set encoding; prevents garbled (mojibake) data.
    charset='utf8',
)
    
def get_db(setting):
    """Open and return a pymysql connection built from *setting*.

    Args:
        setting: mapping of keyword arguments for ``pymysql.connect``
            (host, database, user, password, charset, ...).
    """
    connection = pymysql.connect(**setting)
    return connection

def insert(db,house):
    values = "'{}',"* 10 + "'{}'"
    sql_values = values.format(house['價格'],house['單位'],house['面積'],house['戶型'],
                               house['樓層'],house['朝向'],house['地鐵'],house['小區'],
                               house['位置'],house['經紀人姓名'],house['經紀人ID'])
    sql = """
        insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,`direction`,`subway`,`community`,`location`,`agent_name`,`agent_id`)
        values({})
    """.format(sql_values)
    print(sql)
    cursor = db.cursor()
    cursor.execute(sql)
    db.commit()
# Script entry: crawl the listing page and persist every house record.
db = get_db(DATABASE)
try:
    links = get_links('http://bj.lianjia.com/zufang/')
    for link in links:
        time.sleep(2)  # throttle so we don't hammer the target server
        house = get_house_info(link)
        # Moved after the fetch: the original printed success *before*
        # actually retrieving anything.
        print('獲取一個房子資訊成功!')
        print(house, end='\r')
        insert(db, house)
finally:
    # Always release the connection (the original never closed it).
    db.close()

開啟資料庫,可以看到租房資訊已經儲存到mysql資料庫。