爬豆瓣没有任何反应

来源:8-3 项目作业

心有所往

2020-05-27 13:42:07

import requests
from concurrent.futures.thread import ThreadPoolExecutor
from lxml import etree
from douban_python_study.handle_mongo import douban_mongo


# 爬虫类
class HandleDoubanMovieTop250(object):
   def __init__(self):
       # 实例目标页列表和请求头
       self.page_url = []
       self.header = {
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
           "Accept-Encoding": "gzip, deflate, br",
           "Accept-Language": "zh-CN,zh;q=0.9",
           "Connection": "keep-alive",
           "Host": "movie.douban.com",
           "Referer": "https://movie.douban.com/top250?start=0&filter=",
           "Sec-Fetch-Mode": "navigate",
           "Sec-Fetch-Site": "same-origin",
           "Sec-Fetch-User": "?1",
           "Upgrade-Insecure-Requests": "1",
           "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
       }

   # 构造url方法
   def handle_page_url(self):
       goal_page_url = 'https://movie.douban.com/top250?start='
       # 传递目标页存储于列表并返回
       for page in range(0, 251):
            page = goal_page_url + str(page)
            self.page_url.append(page)
       return self.page_url

   # 请求文本数据
   def handle_request(self, url):
       response = requests.get(url=url, headers=self.header)
       response.encoding = 'gbk'
       result = response.text
       return result

   # 解析文本数据
   def handle_page_detail(self, url):
       result = self.handle_request(url)
       info_list = []
       info = {}
       html_douban = etree.HTML(result)
       info['movie_name'] = html_douban.xpath("//div[@class='hd']/a/span[contains(@class,string())/text()]")[0]
       info['actors_information'] = html_douban.xpath("//div[@class='bd']/p[1]/text()")[0]
       info['score'] = html_douban.xpath("//div[@class='bd']/div[@class='star']/span[last()-2]/text()")[0]
       info['evaluate'] = html_douban.xpath("//div[@class='bd']/div[@class='star']/span[last()]/text()")[0]
       try:
           info['describe'] = html_douban.xpath("//div[@class='bd']/p[@class='quote']/span[@class='inq']/text()")[0]
       except:
           info['describe'] = '无简述'
       info['from_url'] = str(url)
       info_list.append(info)
       # 调用mongodb的函数,并传入电影信息存储数据库里
       douban_mongo.handle_save_data(info_list)

   # 启动方法执行爬虫
   def run(self):
       page_url = self.handle_page_url()
       # 创建线程池
       executor = ThreadPoolExecutor()
       for url in page_url:
           executor.submit(self.handle_page_detail, url)
       # 关闭线程
       executor.shutdown()


# 入口函数
def main():
   douban = HandleDoubanMovieTop250()
   douban.run()


if __name__ == '__main__':
   # 调用入口函数
   main()

写回答

1回答

时间,

2020-05-27

同学,你好。

1、在构造url时应该是每一页的数据,每一页有25个电影

http://img.mukewang.com/climg/5ece4e6d09132abd08220184.jpg

http://img.mukewang.com/climg/5ece4e0309a04ea705840279.jpg

2、同学需要去掉header中的Accept-Encoding(声明浏览器支持的编码类型),去掉后边的设置编码为gbk即可

http://img.mukewang.com/climg/5ece4e8a09d73b0407620400.jpg

3、需要解析当前页面有多少个电影信息,再循环遍历得到具体的内容

http://img.mukewang.com/climg/5ece4f7609c3a8b308420066.jpghttp://img.mukewang.com/climg/5ece5008091023aa11010156.jpg


如果我的回答解决了您的疑惑,请采纳!祝学习愉快~~~~

0

0 学习 · 1672 问题

查看课程