I can crawl the URLs, but an error is thrown at the end and nothing gets written to the database. What is the cause?
Source: 7-5 Hands-on: implementing persistence of the crawled data - 1
破邪返瞳
2022-07-14 19:48:54
import json
import requests
from multiprocessing import Queue
import re
import threading
from urllib import request
import pymongo
from handle_pymongo import HandleMongo


class Crawl_page(threading.Thread):
    # Sends a request for each page number
    # Request URL: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # thread name
        self.thread_name = thread_name
        # page-number queue
        self.page_queue = page_queue
        # data queue
        self.data_queue = data_queue

        targetUrl = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
        # proxy server
        proxyHost = "dyn.horocn.com"
        proxyPort = "50000"
        # proxy tunnel credentials
        proxyUser = "6X5N1738322642599685"
        proxyPass = "7fmcJrTextwKOsEw"
        self.proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        proxy_handler = request.ProxyHandler({
            "http": self.proxyMeta,
            "https": self.proxyMeta,
        })
        opener = request.build_opener(proxy_handler)
        request.install_opener(opener)
        resp = request.urlopen(targetUrl).read()
        print(resp)

        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "_uab_collina=165728267287860912213435;guid=e43c9ae66458b2fbcf09698769fca338;nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D;search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21;acw_sc__v3=62cf8b2fa5bd639aae743b33e35ca7db8bf341cb;acw_tc=ac11000116577687513341042e00d72813d8c245686cadc0dfce00083a0651; ssxmod_itna=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx3qGNdUoDZDiqAPGhDC8+KbxO8D+xCFi7P2TF8ejc4eL80rxnx8FZQpI3TDCPGnDBIx33IDYYCDt4DTD34DYDio=GyAqBQDjxAQDjlKDLDAUlD7=D4qjyjDDHKGremqGEDBtXD0QDATbOteDR9qDgzqD+9qDMl4G29kbxxiQ7eptDbrteDDNe9PxkiGn9Ap9Vfeu=MenrZQWXtaW8HEtlBM=Db4QaOWtEckN8x4+aKipsKAQt82DdCD4eQiqaj0+enA/bW2GAYbw=UYwo/rDxDW=DqV0D4D;ssxmod_itna2=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx4A=a=i4AKD/BCxFO09c=FKAPqFnOu4+s=n7uv4CFj0gvPqeo7w0NYog0nRasC87P4r=VYmTrUUrTil4gAYIFmw6O/b4E/GGKWYLm0l2ohz4i/WfA03rK=R254iDU7W8Lo5qXg/wqo2TRcQA3sKAUc3EbBfkWBKnKoNBvC/nAxLffcKWRglR4U3lNGjEWYmgT+/LTh+ABe62DqDec=mg2=A+8pKA0RzQ5+s+ECz9ANdlfNHefX0EWzLT+L51QYBRLGjh/B57jY+XY+WwSKTEUuW4BNIRU0+Ll5tUIfQiEWG58W+Rq9Gmu+9GlKuRpKMnM+b90HjiaO8aLYFR3O3M7vI42bEimEiFpQsWQ5O0G9flxViYffEbbAOWbaWyTbmYgAbUANeWbSEcF+KLpb4wQORN68HtfaPa1IfAm8WbbKMEAngaHyWNkamEGevOa4i5nFPMN9855av3M1LkHNM8ykHyS86GG33WkF=ad/WxDKuYD7=DYKg+mx80exW5eD=",
            "Host": "search.51job.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            # "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64; x64)AppleWebKit/537.36(KHTML, like Gecko)Chrome/103.0.0.0 Safari/537.36",
            "sec-ch-ua": "'.Not/A)Brand';v='99','Google Chrome';v='103','Chromium';v='103'",
        }

    def run(self):
        # thread entry point
        print("Thread started: %s" % self.thread_name)
        # get_nowait() does not block; it raises an exception when the queue is empty
        while not page_flag:
            try:
                # fetch a page number
                page = self.page_queue.get_nowait()
            except:
                pass
            else:
                print("Got page number %s" % page)
                page_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2," + str(page) + ".html"
                print("Requesting URL: %s" % page_url)
                try:
                    response = requests.get(url=page_url, headers=self.header)
                except Exception as e:
                    print('e:', e)
                    return
                response.encoding = 'utf-8'
                # print('response', response.text)
                self.data_queue.put(response.text)


class Crawl_html(threading.Thread):
    # Processes the HTML returned for each page
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("Text-processing thread started: %s" % self.thread_name)
        while not data_flag:
            try:
                text = self.data_queue.get_nowait()
            except:
                pass
            else:
                result = self.parse(text)
                data = json.loads(result).get('engine_search_result')
                # storage logic: take a lock to prevent the threads from contending
                # print(result)
                with self.lock:
                    HandleMongo().insert_data(data=data)

    def parse(self, text):
        # text processing
        # regex that matches the embedded search-result JSON
        search_data = re.compile(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>')
        # extract the data
        data = search_data.search(text)
        if data:
            job_items = data.group(1)
            return job_items


page_flag = False
data_flag = False


def main():
    # page-number queue
    page_queue = Queue()
    # page-HTML data queue
    data_queue = Queue()
    # create a lock
    lock = threading.Lock()
    # put the page numbers into the page queue
    for page in range(1, 911):
        # put() adds the item to the queue
        page_queue.put(page)
    print("Total pages queued: %s" % page_queue.qsize())

    # crawl threads
    crawlList = ["Page crawl thread 1", "Page crawl thread 2", "Page crawl thread 3"]
    # keep references to the started threads
    page_thread_list = []
    # start the crawl threads
    for thread_name_page in crawlList:
        thread_page = Crawl_page(thread_name=thread_name_page, page_queue=page_queue, data_queue=data_queue)
        # start the thread
        thread_page.start()
        page_thread_list.append(thread_page)

    # text-processing threads
    parseList = ["Text parse thread 1", "Text parse thread 2", "Text parse thread 3"]
    parse_thread_list = []
    for thread_name_parse in parseList:
        thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # stop the threads
    global page_flag, data_flag
    # empty() checks whether the queue is empty
    while not page_queue.empty():
        pass
    page_flag = True
    # join the threads
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "finished")

    while not data_queue.empty():
        pass
    data_flag = True
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "finished")


if __name__ == '__main__':
    main()
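To see where the run fails, one option is to parse a single page outside the threads and look at what the embedded JSON actually contains. A hedged diagnostic sketch; the helper name inspect_page is illustrative and not part of the script above, only the URL pattern and regex come from it:

import json
import re

import requests


def inspect_page(url, headers):
    # Illustrative debugging helper, not part of the crawler above.
    text = requests.get(url, headers=headers).text
    match = re.search(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>', text)
    if match is None:
        print("regex matched nothing: the page layout changed or the request was blocked")
        return
    try:
        payload = json.loads(match.group(1))
    except ValueError as e:
        print("matched text is not valid JSON:", e)
        return
    # shows whether 'engine_search_result' is actually one of the returned fields
    print("top-level keys:", list(payload.keys()))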
handle_pymongo.py (imported by the script above):

import pymongo


class HandleMongo(object):
    def __init__(self):
        myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_data(self, data):
        self.mycollection.insert_many(data)
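On the "cannot write to the database" part: pymongo's insert_many() requires a non-empty list of documents, so if the engine_search_result lookup returns None the insert raises an error. A hedged sketch of a defensive insert_data; the guard and log message are additions for debugging, not part of the original class:

import pymongo


class HandleMongo(object):
    def __init__(self):
        myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_data(self, data):
        # insert_many() needs a non-empty list of dicts; skip and log instead of
        # letting the worker thread crash when the parse step returned nothing
        if not data:
            print("insert_data skipped, got: %r" % (data,))
            return
        self.mycollection.insert_many(data)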
1 Answer
好帮手慕小猿
2022-07-15
Hi! The site has been updated since the course was made, which is why you are no longer getting the data. Please make the following changes (a sketch applying both follows the list):
1. Change response.encoding = 'utf-8' to response.encoding = 'gbk' so the returned data is not garbled.
2. When reading the parsed JSON, use the field 'engine_jds' instead of 'engine_search_result'.
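A minimal sketch of how the two changes could look, written as a standalone helper rather than the original threaded classes; the function name fetch_jobs and its arguments are illustrative, and only the gbk line and the engine_jds key come from the fix above:

import json
import re

import requests


def fetch_jobs(page, headers):
    # Illustrative helper, not part of the course code.
    page_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2," + str(page) + ".html"
    response = requests.get(url=page_url, headers=headers)
    # Change 1: the page is GBK-encoded, so decode it as gbk instead of utf-8
    response.encoding = 'gbk'
    # Same regex as parse() in the question's code
    match = re.search(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>', response.text)
    if match is None:
        return None
    # Change 2: the job list now sits under "engine_jds" rather than "engine_search_result"
    return json.loads(match.group(1)).get('engine_jds')

In the threaded version, the same two lines change inside Crawl_page.run() and Crawl_html.run() respectively.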
Happy learning~