注:处理异步加载的内容时,需要模拟浏览器发起请求(在请求头中设置 User-Agent),然后导入 json 模块,用 json.loads 解析返回的 JSON 数据
例如:
代码:
#! /usr/bin/env python
# -*- coding=utf-8 -*-
"""Scrape movie information from movie.douban.com into ``info.txt``.

Two sources are read:

* the HTML landing page, whose movie entries carry ``data-*`` attributes
  that are extracted with regular expressions (``getry`` / ``write``);
* the asynchronously loaded JSON endpoint ``/j/search_subjects``, which is
  requested page by page and decoded with ``json.loads``
  (``getrm`` / ``write1``).

The script runs on both Python 2 and Python 3.
"""
import io
import json
import re
import sys

import requests

if sys.version_info[0] == 2:
    # Python 2 only: keep the original process-wide default encoding so that
    # implicit str/unicode conversions (e.g. ``print`` into a pipe) still work.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Not used by the code in this file; kept so the module's names are unchanged.
classinfo = []

# Output file shared by write() and write1().  The encoding is explicit so the
# result is UTF-8 regardless of interpreter version or platform locale.
f = io.open('info.txt', 'w', encoding='utf-8')

# Running record counter shared by write() and write1().
num = 0

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) '
                  'Gecko/20100101 Firefox/43.0',
}
HOME_URL = 'http://movie.douban.com/'
SEARCH_URL = 'http://movie.douban.com/j/search_subjects'
PAGE_SIZE = 20         # value sent as page_limit
PAGE_COUNT = 16        # number of result pages requested by getrm()
REQUEST_TIMEOUT = 10   # seconds; without it a stalled connection hangs forever


def _attr(pattern, text):
    """Return the first capture group of ``pattern`` in ``text``.

    Returns an empty string when the pattern does not match, so one entry
    with a missing attribute does not abort the whole run.
    """
    found = re.search(pattern, text, re.S)
    return found.group(1) if found else u''


def write(htm):
    """Extract every movie entry from an HTML response and append it to ``f``.

    ``htm`` is a response object whose ``text`` attribute holds the page HTML.
    Each fragment between ``data-tit`` and ``data-enough`` is treated as one
    movie; the fragment is printed and then written as a numbered record.
    """
    global num
    for each in re.findall('data-tit(.*?)data-enough', htm.text, re.S):
        print(each)
        info = {
            # The fragment starts right after "data-tit", so the title
            # attribute appears as: le="...".
            'title': _attr('le="(.*?)"', each),
            'year': _attr('data-release="(.*?)" data', each),
            'Rating': _attr('data-rate="(.*?)" data-star', each),
            'time': _attr('data-duration="(.*?)" data-re', each),
            'reg': _attr('data-region="(.*?)" data-dir', each),
            'act': _attr('data-actors="(.*?)" data-in', each),
        }
        num += 1
        f.write(u'%d\n' % num)
        f.write(u'电影名:' + info['title'] + '\n')
        f.write(u'主演:' + info['act'] + '\n')
        f.write(u'电影地区:' + info['reg'] + '\n')
        f.write(u'上映年份:' + info['year'] + '\n')
        f.write(u'电影时长:' + info['time'] + '\n')
        f.write(u'评分:' + info['Rating'] + '\n\n')


def write1(info):
    """Append one numbered record to ``f``.

    ``info`` is a mapping with the keys ``'title'``, ``'Rating'`` and
    ``'url'``.
    """
    global num
    num += 1
    f.write(u'%d\n' % num)
    f.write(u'电影名:' + info['title'] + '\n')
    f.write(u'评分:' + info['Rating'] + '\n')
    f.write(u'链接:' + info['url'] + '\n\n')


def getry():
    """Download the landing page and record the movies found in its HTML."""
    html = requests.get(HOME_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    html.encoding = 'utf-8'
    write(html)


def getrm():
    """Page through the JSON search endpoint and record every subject.

    Requests ``PAGE_COUNT`` pages of ``PAGE_SIZE`` entries each.  Every item
    actually present in a page's ``'subjects'`` list is written, so a short
    or empty page no longer raises ``IndexError``.
    """
    for page in range(PAGE_COUNT):
        # Let requests build and URL-encode the query string instead of
        # patching a hand-written URL with re.sub.
        query = {
            'type': 'movie',
            'tag': u'热门',
            'sort': 'recommend',
            'page_limit': PAGE_SIZE,
            'page_start': page * PAGE_SIZE,
        }
        response = requests.get(SEARCH_URL, params=query, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
        # Fail with a clear HTTP error rather than a JSON decode error on an
        # error page.
        response.raise_for_status()
        jsdict = json.loads(response.content.decode('utf-8'))
        for subject in jsdict['subjects']:
            write1({
                'title': subject['title'],
                'Rating': subject['rate'],
                'url': subject['url'],
            })


if __name__ == "__main__":
    try:
        getry()
        getrm()
    finally:
        # Flush and release the output file even if a request fails.
        f.close()
效果图: