python 爬蟲示例,方便日後參考
阿新 • • 發佈:2018-07-07
主函數 cto fin iter rep incr one lines web
# Scraper example: download one movie detail page, pull a handful of fields
# out of it with XPath and append them as one '|'-delimited line to a text
# file. (The failure message below refers to a "TMDB id", so the pages are
# presumably TMDB movie pages -- confirm against the URL list.)
import time

# Placeholder written for every field that could not be scraped.
NULL_FIELD = "NULL"

# Result file (appended to by getOneMoviesInfo, truncated by main()).
OUTPUT_PATH = r'C:\Users\yuqiao\Desktop\testSpider.txt'
# Input file read by main(); one entry per line.
MID_URL_PATH = r'D:\git\ZiyeMovie\MidURL.txt'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 30

# Query-string suffix selecting the page language. Not used by the code
# below; kept because it is part of the original script.
language = '?language=zh-CN'

# Absolute XPath locations of the scraped fields.
PICTURE_XPATH = '//*[@id="main"]/section/div[1]/div/div/section/div[1]/div[1]/img/@src'
NAME_XPATH = '//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/a/h2/text()'
YEAR_XPATH = '//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/span/text()'
DATE_XPATH = '//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()'
BRIEF_XPATH = '//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/div/p/text()'
CREATORS_XPATH = '//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/ol/li'
STARS_XPATH = '//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li'

# Number of writer / star columns in every output record.
WRITER_COLUMNS = 3
STAR_COLUMNS = 5

# One record: Mid, name, brief, year, date, director, 3 writers, 5 stars,
# picture.
RECORD_FORMAT = "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}\n"


def first_or_null(nodes, index=0):
    """Return ``nodes[index]``, or "NULL" when the result list is too short."""
    return nodes[index] if len(nodes) > index else NULL_FIELD


def pad_with_null(values, size):
    """Return exactly ``size`` items: ``values`` truncated or padded with "NULL"."""
    padded = list(values)[:size]
    padded.extend([NULL_FIELD] * (size - len(padded)))
    return padded


def split_credits(credits):
    """Split ``(name, profession)`` pairs into ``(director, writers)``.

    A profession containing 'Director' sets the director (the last such
    entry wins); otherwise one containing 'Screenplay' or 'Writer' adds the
    name to the writers list. The director is "NULL" when none is found.
    """
    director = NULL_FIELD
    writers = []
    for creator_name, profession in credits:
        if 'Director' in profession:
            director = creator_name
        elif 'Screenplay' in profession or 'Writer' in profession:
            writers.append(creator_name)
    return director, writers


def format_record(Mid, name, brief, year, date, director, writers, stars,
                  picture):
    """Build one output line; writers/stars are fitted to 3 and 5 columns."""
    fields = [Mid, name, brief, year, date, director]
    fields.extend(pad_with_null(writers, WRITER_COLUMNS))
    fields.extend(pad_with_null(stars, STAR_COLUMNS))
    fields.append(picture)
    return RECORD_FORMAT.format(*fields)


def getOneMoviesInfo(Mid, url, out_path=OUTPUT_PATH, timeout=REQUEST_TIMEOUT):
    """Scrape one movie page and append its record to ``out_path``.

    Args:
        Mid: movie identifier, written as the first column of the record.
        url: address of the movie page to download.
        out_path: file the record is appended to (UTF-8).
        timeout: seconds passed to ``requests.get``.

    Returns:
        None. When the page has no title node, a failure message is printed
        and nothing is written. Otherwise the record is appended and ``Mid``
        and the title are printed.

    Raises:
        requests.exceptions.RequestException: the download failed or timed
            out.
        OSError: ``out_path`` could not be opened for appending.
    """
    # Imported here, as in the original, so the module itself can be loaded
    # without the third-party packages installed.
    import requests
    from lxml import etree

    data = requests.get(url, timeout=timeout).text  # download the page
    tree = etree.HTML(data)  # parse it

    # etree.HTML() can yield None for an empty document; treat that like a
    # page without a title.
    names = tree.xpath(NAME_XPATH) if tree is not None else []
    if not names:
        print("Mid = %s , failed for a lack of TMDB id " % (Mid,))
        return
    name = names[0]

    # Every remaining field falls back to "NULL" instead of raising
    # IndexError when its node is missing from the page.
    picture = first_or_null(tree.xpath(PICTURE_XPATH))
    year = first_or_null(tree.xpath(YEAR_XPATH)).strip().strip("()").strip()
    # The date is the second text node of the list item.
    date = first_or_null(tree.xpath(DATE_XPATH), 1).strip()
    # Keep the record on a single line: real newlines become a literal "\n".
    brief = first_or_null(tree.xpath(BRIEF_XPATH)).replace("\n", "\\n")

    # Main creators: entries lacking a name link or a profession text are
    # skipped so they cannot wipe out credits collected from other entries.
    credits = []
    for item in tree.xpath(CREATORS_XPATH):
        creator_names = item.xpath('./p[1]/a/text()')
        professions = item.xpath('./p[2]/text()')
        if creator_names and professions:
            credits.append((creator_names[0], professions[0]))
    director, writers = split_credits(credits)

    # Cast: entries without a name link are skipped.
    stars = []
    for item in tree.xpath(STARS_XPATH):
        star_names = item.xpath('./p[1]/a/text()')
        if star_names:
            stars.append(star_names[0])

    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(format_record(Mid, name, brief, year, date, director,
                              writers, stars, picture))
    print(Mid)
    print(name)


# ______________________________ main ______________________________
def main():
    """Reset the output file, then print the selected input line(s)."""
    # Truncate the result file so a run starts from an empty file.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write("")

    with open(MID_URL_PATH, "rt", encoding='utf-8') as in_file:
        lines = in_file.read().split("\n")

    # Index range of the input lines to process, e.g. range(51, 61) covers
    # lines 51..60.
    for i in range(9124, 9125):
        line = lines[i]
        # NOTE(review): the line is only printed; getOneMoviesInfo() is never
        # called here. Presumably the line should be split into Mid and URL
        # and passed on -- confirm the format of the input file first.
        print(line)
    print('finished')


if __name__ == "__main__":
    main()
python 爬蟲示例,方便日後參考