1. 程式人生 > python 爬取電影名、電視名或人名

python 爬取電影名、電視名或人名


#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import re
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

#根據指定的URL獲取網頁內容
def gethtml(url):
    """Fetch *url* and return the raw response body.

    :param url: address of the page to download.
    :returns: whatever ``response.read()`` yields for the opened URL.
    :raises urllib2.URLError: propagated unchanged from ``urllib2.urlopen``.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # The Python 2 response object is not a context manager, so close it
        # explicitly; otherwise every call leaves a socket open.
        response.close()

#獲取分頁資料
def getname(html):
    """Return the link texts of every ``<a target="_blank">`` element.

    The matching anchors are collected with BeautifulSoup, rendered back to
    a string, and the text between ``target="_blank">`` and the next ``<``
    is captured with a regular expression.

    :param html: markup of one listing page, as returned by ``gethtml``.
    :returns: list of captured strings, in the order they were matched.
    """
    bs = BeautifulSoup(html)
    anchors = bs.find_all('a', target='_blank')
    # Only matches anchors whose rendered opening tag ends with
    # target="_blank", because the pattern requires '>' right after it.
    rel = r'target=\"_blank\"\>(.+?)\<'
    return re.findall(rel, str(anchors))


def save(url):
    """Download *url* and write every name found on it to the output file.

    One name is written per line to the module-level file object ``fo``,
    which must already be open for writing when this is called.

    :param url: address of the listing page to scrape.
    """
    # ``fo`` is only read here, so no ``global`` declaration is required.
    for name in getname(gethtml(url)):
        fo.write(name.decode('utf-8') + '\n')


def getmain(html):
    """Return the category URLs found on the main index page.

    Looks at ``<a>`` elements carrying the classes
    ``html-attribute-value html-external-link`` and captures their
    ``http://`` targets.
    NOTE(review): those class names look like a browser "view source"
    rendering of the page -- confirm how t.html was produced.

    :param html: markup of the index page (string or open file object).
    :returns: list of captured URL strings.
    """
    bs = BeautifulSoup(html)
    links = bs.find_all('a', class_='html-attribute-value html-external-link')
    # ``.[^w]`` rejects URLs whose second character after "http://" is 'w',
    # which excludes "http://www..." links.
    rel = r'href=\"(http://.[^w][^\"]+?)\"'
    return re.findall(rel, str(links))


url_main = "http://www.resgain.net/xmdq.html"
filename = "rename.txt"

# The index page is read from a locally saved copy ("t.html") rather than
# downloaded from url_main; the handle is closed as soon as it is parsed.
with open("t.html") as html_main:
    all_tag = getmain(html_main)

fo = open(filename, "w")
try:
    for i in all_tag:
        print(i)
        save(i)
        # Split off the last five characters (assumed to be ".html" -- TODO
        # confirm) so follow-up pages can be addressed as <base>_<n><suffix>.
        i1 = i[:-5]
        i2 = i[-5:]
        for j in range(2, 11):
            url_child = i1 + '_' + str(j) + i2
            print(url_child)
            save(url_child)
finally:
    # Always release the output file, even if a download or parse fails.
    fo.close()