The crawler can fetch the URLs, but it throws an error at the end and nothing gets written to the database. What is the cause?

Source: 7-5 Hands-on: Implementing persistence for the crawled data - 1

破邪返瞳

2022-07-14 19:48:54

import json

import requests
from multiprocessing import Queue
import re
import threading
from urllib import request
import pymongo
from handle_pymongo import HandleMongo


class Crawl_page(threading.Thread):
    # Sends requests for each page number
    # Request URL: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # Thread name
        self.thread_name = thread_name
        # Queue of page numbers
        self.page_queue = page_queue
        # Queue of raw page data
        self.data_queue = data_queue
        targetUrl = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
        # Proxy server
        proxyHost = "dyn.horocn.com"
        proxyPort = "50000"

        # Proxy tunnel credentials
        proxyUser = "6X5N1738322642599685"
        proxyPass = "7fmcJrTextwKOsEw"

        self.proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        proxy_handler = request.ProxyHandler({
            "http": self.proxyMeta,
            "https": self.proxyMeta,
        })
        opener = request.build_opener(proxy_handler)
        request.install_opener(opener)
        resp = request.urlopen(targetUrl).read()
        print(resp)
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "_uab_collina=165728267287860912213435;guid=e43c9ae66458b2fbcf09698769fca338;nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D;search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21;acw_sc__v3=62cf8b2fa5bd639aae743b33e35ca7db8bf341cb;acw_tc=ac11000116577687513341042e00d72813d8c245686cadc0dfce00083a0651; ssxmod_itna=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx3qGNdUoDZDiqAPGhDC8+KbxO8D+xCFi7P2TF8ejc4eL80rxnx8FZQpI3TDCPGnDBIx33IDYYCDt4DTD34DYDio=GyAqBQDjxAQDjlKDLDAUlD7=D4qjyjDDHKGremqGEDBtXD0QDATbOteDR9qDgzqD+9qDMl4G29kbxxiQ7eptDbrteDDNe9PxkiGn9Ap9Vfeu=MenrZQWXtaW8HEtlBM=Db4QaOWtEckN8x4+aKipsKAQt82DdCD4eQiqaj0+enA/bW2GAYbw=UYwo/rDxDW=DqV0D4D;ssxmod_itna2=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx4A=a=i4AKD/BCxFO09c=FKAPqFnOu4+s=n7uv4CFj0gvPqeo7w0NYog0nRasC87P4r=VYmTrUUrTil4gAYIFmw6O/b4E/GGKWYLm0l2ohz4i/WfA03rK=R254iDU7W8Lo5qXg/wqo2TRcQA3sKAUc3EbBfkWBKnKoNBvC/nAxLffcKWRglR4U3lNGjEWYmgT+/LTh+ABe62DqDec=mg2=A+8pKA0RzQ5+s+ECz9ANdlfNHefX0EWzLT+L51QYBRLGjh/B57jY+XY+WwSKTEUuW4BNIRU0+Ll5tUIfQiEWG58W+Rq9Gmu+9GlKuRpKMnM+b90HjiaO8aLYFR3O3M7vI42bEimEiFpQsWQ5O0G9flxViYffEbbAOWbaWyTbmYgAbUANeWbSEcF+KLpb4wQORN68HtfaPa1IfAm8WbbKMEAngaHyWNkamEGevOa4i5nFPMN9855av3M1LkHNM8ykHyS86GG33WkF=ad/WxDKuYD7=DYKg+mx80exW5eD=",
            "Host": "search.51job.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            # "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64; x64)AppleWebKit/537.36(KHTML, like Gecko)Chrome/103.0.0.0 Safari/537.36",
            "sec-ch-ua": "'.Not/A)Brand';v='99','Google Chrome';v='103','Chromium';v='103'",
        }

    def run(self):
        # Entry point of the thread
        print("Starting thread %s" % self.thread_name)
        # get_nowait() does not block; it raises an exception when the queue is empty
        while not page_flag:
            try:
                # Fetch a page number
                page = self.page_queue.get_nowait()
            except:
                pass
            else:
                print("当前获取到的页码%s" % page)
                page_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2," + str(page) + ".html"
                print("当前请求的URL为:%s" % page_url)
                try:
                    response = requests.get(url=page_url, headers=self.header)
                except Exception as e:
                    print('e:', e)
                    return
                response.encoding = 'utf-8'
                # print('response',response.text)
                self.data_queue.put(response.text)


class Crawl_html(threading.Thread):
    # Processes the HTML returned for each page
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("当前处理文本任务的线程为:%s" % self.thread_name)
        while not data_flag:
            try:
                text = self.data_queue.get_nowait()
            except:
                pass
            else:
                result = self.parse(text)
                data=json.loads(result).get('engine_search_result')
                # Persist the data; the lock keeps the threads from writing at the same time
                # print(result)
                with self.lock:
                    HandleMongo().insert_data(data=data)

    def parse(self, text):
        # Parse the page text
        # Regex that matches the embedded JSON data
        search_data = re.compile(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>')
        # Extract the JSON string
        data = search_data.search(text)
        if data:
            job_items = data.group(1)
            return job_items


page_flag = False
data_flag = False


def main():
    # Queue of page numbers
    page_queue = Queue()
    # Queue of raw page HTML
    data_queue = Queue()
    # Lock used when writing to MongoDB
    lock = threading.Lock()

    # Put every page number into the page queue
    for page in range(1, 911):
        # put() adds an item to the queue
        page_queue.put(page)
    print("Total page numbers stored in the queue: %s" % page_queue.qsize())

    # Crawler threads
    crawlList = ["Page thread 1", "Page thread 2", "Page thread 3"]
    # Keep references to the started threads
    page_thread_list = []
    # Start the crawler threads
    for thread_name_page in crawlList:
        thread_page = Crawl_page(thread_name=thread_name_page, page_queue=page_queue, data_queue=data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)

    # Text-processing threads
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    parse_thread_list = []
    for thread_name_parse in parseList:
        thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # Stopping the threads
    global page_flag, data_flag
    # empty() checks whether the queue has been drained
    while not page_queue.empty():
        pass
    page_flag = True

    # Wait for the page threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "finished")

    while not data_queue.empty():
        pass
    data_flag = True

    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "finished")


if __name__ == '__main__':
    main()
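

# handle_pymongo.py (the module imported at the top as HandleMongo)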
import pymongo

class HandleMongo(object):
    def __init__(self):
        myclient=pymongo.MongoClient("mongodb://127.0.0.1:27017")
        mydb=myclient['db_51job']
        self.mycollection=mydb['collection_51job']

    def insert_data(self,data):
        self.mycollection.insert_many(data)

[Screenshots of the error output:
https://img.mukewang.com/climg/62d00237095493c014060616.jpg
https://img.mukewang.com/climg/62d0024a0923562913390535.jpg]



1 Answer

好帮手慕小猿

2022-07-15

Hi! The website has been updated since the course was recorded, which is why no data is being retrieved. You can make the following changes:

1. Change response.encoding = "utf-8" to response.encoding = "gbk" so the returned data is not garbled.
[Screenshot: https://img.mukewang.com/climg/62d0e63e0960add107980246.jpg]
2. When extracting the JSON, use the field "engine_jds" instead of "engine_search_result".
[Screenshot: https://img.mukewang.com/climg/62d0e7300992441e07670410.jpg]
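Putting the two changes together, here is a minimal sketch of fetching and persisting a single page. The URL pattern, regex, and MongoDB names are copied from the code in the question; the gbk encoding and the engine_jds field come from the two points above; the helper name crawl_one_page and the empty-data guard are illustrative assumptions, not part of the course code:

import json
import re

import pymongo
import requests


def crawl_one_page(page, headers):
    # Build the search URL for one page (same pattern as in the question)
    page_url = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
                "python,2,%s.html" % page)
    response = requests.get(url=page_url, headers=headers)
    # Fix 1: decode the response as gbk instead of utf-8
    response.encoding = 'gbk'

    # Same regex as in the question: pull the embedded JSON out of the page
    match = re.search(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>', response.text)
    if not match:
        print("No embedded JSON found on page %s" % page)
        return

    # Fix 2: the job list now sits under the 'engine_jds' field
    data = json.loads(match.group(1)).get('engine_jds')
    if data:
        # insert_many() raises when given an empty document list, so only insert when data exists
        client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        client['db_51job']['collection_51job'].insert_many(data)

If the regex no longer matches the page at all, passing None on to json.loads() would itself raise an exception, so the sketch returns early instead.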
Happy learning~
