爬豆瓣没有任何反应
来源:8-3 项目作业
心有所往
2020-05-27 13:42:07
import requests
from concurrent.futures.thread import ThreadPoolExecutor
from lxml import etree
from douban_python_study.handle_mongo import douban_mongo
# 爬虫类
class HandleDoubanMovieTop250(object):
def __init__(self):
# 实例目标页列表和请求头
self.page_url = []
self.header = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Host": "movie.douban.com",
"Referer": "https://movie.douban.com/top250?start=0&filter=",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
# 构造url方法
def handle_page_url(self):
goal_page_url = 'https://movie.douban.com/top250?start='
# 传递目标页存储于列表并返回
for page in range(0, 251):
page = goal_page_url + str(page)
self.page_url.append(page)
return self.page_url
# 请求文本数据
def handle_request(self, url):
response = requests.get(url=url, headers=self.header)
response.encoding = 'gbk'
result = response.text
return result
# 解析文本数据
def handle_page_detail(self, url):
result = self.handle_request(url)
info_list = []
info = {}
html_douban = etree.HTML(result)
info['movie_name'] = html_douban.xpath("//div[@class='hd']/a/span[contains(@class,string())/text()]")[0]
info['actors_information'] = html_douban.xpath("//div[@class='bd']/p[1]/text()")[0]
info['score'] = html_douban.xpath("//div[@class='bd']/div[@class='star']/span[last()-2]/text()")[0]
info['evaluate'] = html_douban.xpath("//div[@class='bd']/div[@class='star']/span[last()]/text()")[0]
try:
info['describe'] = html_douban.xpath("//div[@class='bd']/p[@class='quote']/span[@class='inq']/text()")[0]
except:
info['describe'] = '无简述'
info['from_url'] = str(url)
info_list.append(info)
# 调用mongodb的函数,并传入电影信息存储数据库里
douban_mongo.handle_save_data(info_list)
# 启动方法执行爬虫
def run(self):
page_url = self.handle_page_url()
# 创建线程池
executor = ThreadPoolExecutor()
for url in page_url:
executor.submit(self.handle_page_detail, url)
# 关闭线程
executor.shutdown()
# 入口函数
def main():
douban = HandleDoubanMovieTop250()
douban.run()
if __name__ == '__main__':
# 调用入口函数
main()
1回答
同学,你好。
1、在构造url时应该是每一页的数据,每一页有25个电影


2、同学需要去掉header中的Accept-Encoding(声明浏览器支持的编码类型),去掉后边的设置编码为gbk即可

3、需要解析当前页面有多少个电影信息,再循环遍历得到具体的内容


如果我的回答解决了您的疑惑,请采纳!祝学习愉快~~~~
相似问题