Python 爬取電影名、電視名或人名
阿新 • • 發佈:2019-02-08
#!/usr/bin/bash
# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Fetch the page content for the given URL.
def gethtml(url):
    """Return the raw body of the HTTP response for ``url``.

    The body is returned exactly as ``response.read()`` yields it; no
    decoding is done here. Any error raised by ``urllib2`` while opening
    or reading the URL propagates to the caller.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Python 2 urllib2 responses are not context managers, so close
        # explicitly to release the connection even if read() fails.
        response.close()
# Extract the entries from one listing page.
def getname(html):
    """Return the strings found right after ``target="_blank">`` in ``html``.

    ``html`` is handed to BeautifulSoup, every ``<a>`` tag carrying
    ``target="_blank"`` is collected, and the collected tags are turned
    back into one string that is scanned with a regular expression.
    Each result is the text between ``target="_blank">`` and the next
    ``<``.
    """
    soup = BeautifulSoup(html)
    anchors = soup.find_all('a', target='_blank')
    # The match runs on the string form of the whole tag list, so the
    # result depends on how bs4 renders tags (attribute order included).
    pattern = r'target=\"_blank\"\>(.+?)\<'
    return re.findall(pattern, str(anchors))
def save(url):
    """Download ``url`` and write every extracted name to ``fo``.

    ``fo`` is the module-level output file; it has to be open for
    writing before this function is called. Each name is decoded as
    UTF-8 and written on its own line.
    """
    for name in getname(gethtml(url)):
        fo.write(name.decode('utf-8') + '\n')
# Extract the category links from the main page.
def getmain(html):
    """Return the category URLs found in the main page ``html``.

    Only ``<a>`` tags whose class is
    ``html-attribute-value html-external-link`` are considered
    (presumably the markup of a saved "view source" page; confirm
    against t.html). The collected tags are turned back into a string
    and every ``href`` matching the pattern below is returned.
    """
    soup = BeautifulSoup(html)
    links = soup.find_all('a', class_='html-attribute-value html-external-link')
    # After "http://" the pattern needs one arbitrary character followed
    # by a character that is not "w", which skips "http://www..." links.
    pattern = r'href=\"(http://.[^w][^\"]+?)\"'
    return re.findall(pattern, str(links))
# Driver: read the category links, then crawl pages 1-10 of every
# category and write the extracted names to `filename`.

# Address of the live main page. It is not fetched here: the links are
# read from a locally saved copy (t.html). Use gethtml(url_main) in place
# of the file below to crawl the live page instead.
url_main = "http://www.resgain.net/xmdq.html"
filename = "rename.txt"

# `with` closes the saved page as soon as the links have been extracted.
with open("t.html") as html_main:
    all_tag = getmain(html_main)

# `fo` has to stay a module-level name because save() writes through it.
# The `with` block closes the output file even if a request fails midway.
with open(filename, "w") as fo:
    for i in all_tag:
        print(i)
        save(i)
        # Split off the last five characters (the ".html" suffix is
        # assumed) so that "_<page>" can be inserted in front of them.
        i1 = i[:-5]
        i2 = i[-5:]
        # Pages 2..10 follow the "<base>_<page><suffix>" naming scheme.
        for j in range(2, 11):
            url_child = i1 + '_' + str(j) + i2
            print(url_child)
            save(url_child)