Python程式設計入門學習筆記(七)
阿新 • • 發佈:2018-12-25
簡單爬蟲
python庫
1、requests 用來獲取頁面內容
2、BeautifulSoup
文件連結:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
爬取鏈家網的資訊
安裝第三方庫
pip install requests
pip install beautifulsoup4 html5lib(程式使用 html5lib 解析器,需一併安裝)
新建資料庫:
CREATE DATABASE /*!32312 IF NOT EXISTS*/`house` /*!40100 DEFAULT CHARACTER SET utf8 */; USE `house`; /*Table structure for table `db_house` */ DROP TABLE IF EXISTS `db_house`; CREATE TABLE `db_house` ( `id` int(11) unsigned NOT NULL AUTO_INCREMENT, `price` varchar(80) DEFAULT NULL, `unit` varchar(80) DEFAULT NULL, `area` varchar(80) DEFAULT NULL, `layout` varchar(80) DEFAULT NULL, `floor` varchar(80) DEFAULT NULL, `direction` varchar(80) DEFAULT NULL, `subway` varchar(80) DEFAULT NULL, `community` varchar(80) DEFAULT NULL, `location` varchar(80) DEFAULT NULL, `agent_name` varchar(80) DEFAULT NULL, `agent_id` varchar(80) DEFAULT NULL, PRIMARY KEY (`id`), KEY `id` (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=40 DEFAULT CHARSET=utf8;
爬蟲程式如下:
import time
import pymysql
import requests
from bs4 import BeautifulSoup
# 獲取url下的頁面內容,返回soup物件 def get_page(url): response = requests.get(url) soup = BeautifulSoup(response.text,'html5lib') return soup # 將以上的程式碼封裝成一個獲取連結的方法函式,作用是獲取列表頁下面的所有租房頁面的連結,返回連結列表 def get_links(link_url): soup = get_page(link_url) links_div = soup.find_all('div',class_="pic-panel") links = [div.a.get('href') for div in links_div] return links def get_house_info(house_url): soup = get_page(house_url) price = soup.find('span', class_='total').text unit = soup.find('span', class_='unit').text.strip() house_info = soup.find_all('p') area = house_info[0].text[3:] layout = house_info[1].text[5:] floor = house_info[2].text[3:] direction = house_info[3].text[5:] subway = house_info[4].text[3:] community = house_info[5].text[3:] location = house_info[6].text[3:] create_time = house_info[7].text[3:] agent = soup.find('a',class_ = 'name LOGCLICK') agent_name = agent.text agent_id = agent.get('data-el') evaluate = soup.find('div',class_='evaluate') score, number = evaluate.find('span', class_ = 'rate').text.split('/') times = evaluate.find('span',class_ = 'time').text[5:-1] info = { '價格': price, '單位': unit, '面積': area, '戶型': layout, '樓層': floor, '朝向': direction, '釋出時間': create_time, '地鐵': subway, '小區': community, '位置': location, '經紀人姓名': agent_name, '經紀人ID': agent_id } return info DATABASE = { 'host': 'localhost', #如果是遠端資料庫,此處為遠端伺服器的ip地址 'database': 'house', 'user' : 'root', 'password': 'toor', # 字符集編碼,防止資料亂碼 'charset' : 'utf8' } def get_db(setting): return pymysql.connect(**setting) def insert(db,house): values = "'{}',"* 10 + "'{}'" sql_values = values.format(house['價格'],house['單位'],house['面積'],house['戶型'], house['樓層'],house['朝向'],house['地鐵'],house['小區'], house['位置'],house['經紀人姓名'],house['經紀人ID']) sql = """ insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,`direction`,`subway`,`community`,`location`,`agent_name`,`agent_id`) values({}) """.format(sql_values) print(sql) cursor = db.cursor() cursor.execute(sql) db.commit()
if __name__ == '__main__':
    # Crawl every listing linked from the index page and persist each one.
    db = get_db(DATABASE)
    try:
        links = get_links('http://bj.lianjia.com/zufang/')
        for link in links:
            time.sleep(2)  # throttle requests so the site doesn't block us
            house = get_house_info(link)
            # Report success only AFTER the page was actually fetched
            # (the original printed this before get_house_info ran).
            print('獲取一個房子資訊成功!')
            print(house, end='\r')
            insert(db, house)
    finally:
        db.close()  # release the MySQL connection even if scraping fails
開啟資料庫,可以看到租房資訊已經儲存到mysql資料庫。