爬蟲爬當當網書籍信息
阿新 • • 發佈:2018-05-02
標籤:正則表達式 requests BeautifulSoup bs4 json utf-8
拖了好久的一個爬蟲
先上代碼 文字慢慢補
1 # -*- coding: utf-8 -* 2 3 import urllib2 4 import xlwt 5 from bs4 import BeautifulSoup 6 from datashape import json 7 import re 8 import json 9 import requests 10 11 12 def getJsonText(url): 13 try: 14 r = requests.get(url, timeout=1) 15 r.raise_for_status() 16 r.encoding = r.apparent_encoding 17 return r.text 18 except: 19 print ‘獲取失敗‘ 20 return ‘‘ 21 22 23 def getgood(url): 24 html = urllib2.urlopen(url).read() 25 26 # 用正則表達式拿取 27 ma = re.search(r‘"productId":"[\d]+"‘, html) 28 productId = eval(ma.group().split(‘:‘)[-1]) 29 categoryPath = eval(ma.group().split(‘:‘)[-1]) 30 mainProductId = eval(ma.group().split(‘:‘)[-1]) 31 # 對Ajax的url進行拼接 32 json_url = ‘http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0‘.format( 33 productId=productId, categoryPath=categoryPath, mainProductId=mainProductId) 34 # 調用方法,下載下來json數據 35 json_html = json.loads(getJsonText(json_url)) 36 summary = json_html[‘data‘][‘list‘][‘summary‘] 37 data = {} 38 data[‘all_comment_num‘] = summary[‘total_comment_num‘] # 總評論數 39 data[‘good_comment_num‘] = summary[‘total_crazy_count‘] # 好評數 40 data[‘middle_comment_num‘] = summary[‘total_indifferent_count‘] # 中評數 41 data[‘bad_comment_num‘] = summary[‘total_detest_count‘] # 差評數 42 data[‘good_rate‘] = summary[‘goodRate‘] # 好評率 43 return data 44 45 def main(): 46 wb = xlwt.Workbook() 47 sheet1 = wb.add_sheet("Sheet") 48 sheet1.write(0, 0, unicode(‘序號‘, "utf-8")) 49 sheet1.write(0, 1, unicode(‘書名‘, "utf-8")) 50 sheet1.write(0, 2, unicode(‘價格‘, "utf-8")) 51 sheet1.write(0, 3, unicode(‘折扣‘, "utf-8")) 52 sheet1.write(0, 4, unicode(‘評論數‘, "utf-8")) 53 sheet1.write(0, 5, unicode(‘好評‘, "utf-8")) 54 sheet1.write(0, 6, unicode(‘中評‘, "utf-8")) 55 sheet1.write(0, 7, unicode(‘差評‘, "utf-8")) 56 sheet1.write(0, 8, unicode(‘好評率‘, "utf-8")) 57 58 for page in range(25): 59 60 url = 
‘http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d‘ % (page+1) 61 get = urllib2.urlopen(url).read() 62 data = BeautifulSoup(get, ‘lxml‘) 63 64 bookname = data.find_all(‘div‘, attrs={‘class‘: ‘name‘}) 65 bookstar = data.find_all(‘div‘, attrs={‘class‘: ‘star‘}) 66 bookprice = data.find_all(‘div‘, attrs={‘class‘: ‘price‘}) 67 bookoff = data.find_all(‘span‘, attrs={‘class‘: ‘price_s‘}) 68 69 for i in range(20): 70 bookurl = bookname[i].find(‘a‘)[‘href‘] 71 data = getgood(bookurl) 72 print (str(page*20+i+1) + " " 73 + bookname[i].find(‘a‘)[‘title‘] + " " # 書名 74 + bookprice[i].find(‘span‘).text[1:] + " " # 價格 75 + bookoff[i].text[:-1] + " " # 折扣 76 + bookstar[i].find(‘a‘).text[:-3] + " " # 評論數 77 + data[‘good_comment_num‘] + " " # 好評數 78 + data[‘middle_comment_num‘] + " " # 中評數 79 + data[‘bad_comment_num‘] + " " # 差評數 80 + data[‘good_rate‘] + " " # 好評率 81 ) 82 83 sheet1.write(page * 20 + i + 1, 0, page * 20 + i + 1) 84 sheet1.write(page * 20 + i + 1, 1, bookname[i].find(‘a‘)[‘title‘]) 85 sheet1.write(page * 20 + i + 1, 2, bookprice[i].find(‘span‘).text[1:]) 86 sheet1.write(page * 20 + i + 1, 3, bookoff[i].text[:-1]) 87 sheet1.write(page * 20 + i + 1, 4, bookstar[i].find(‘a‘).text[:-3]) 88 sheet1.write(page * 20 + i + 1, 5, data[‘good_comment_num‘]) 89 sheet1.write(page * 20 + i + 1, 6, data[‘middle_comment_num‘]) 90 sheet1.write(page * 20 + i + 1, 7, data[‘bad_comment_num‘]) 91 sheet1.write(page * 20 + i + 1, 8, data[‘good_rate‘]) 92 wb.save(‘test.xls‘) 93 94 main()
爬蟲爬當當網書籍信息