The URLs can be crawled, but an error is thrown at the end and nothing is written to the database. What is the cause?
Source: 7-5 Hands-on: Implementing the crawl-data persistence logic - 1
破邪返瞳
2022-07-14 19:48:54
import json
import requests
from multiprocessing import Queue
import re
import threading
from urllib import request
import pymongo
from handle_pymongo import HandleMongo


class Crawl_page(threading.Thread):
    # Sends requests for the page-number URLs
    # Request URL: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # Thread name
        self.thread_name = thread_name
        # Page-number queue
        self.page_queue = page_queue
        # Data queue
        self.data_queue = data_queue
        targetUrl = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
        # Proxy server
        proxyHost = "dyn.horocn.com"
        proxyPort = "50000"
        # Proxy tunnel credentials
        proxyUser = "6X5N1738322642599685"
        proxyPass = "7fmcJrTextwKOsEw"
        self.proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        proxy_handler = request.ProxyHandler({
            "http": self.proxyMeta,
            "https": self.proxyMeta,
        })
        opener = request.build_opener(proxy_handler)
        request.install_opener(opener)
        resp = request.urlopen(targetUrl).read()
        print(resp)
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
"Cookie": "_uab_collina=165728267287860912213435;guid=e43c9ae66458b2fbcf09698769fca338;nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D;search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21;acw_sc__v3=62cf8b2fa5bd639aae743b33e35ca7db8bf341cb;acw_tc=ac11000116577687513341042e00d72813d8c245686cadc0dfce00083a0651; ssxmod_itna=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx3qGNdUoDZDiqAPGhDC8+KbxO8D+xCFi7P2TF8ejc4eL80rxnx8FZQpI3TDCPGnDBIx33IDYYCDt4DTD34DYDio=GyAqBQDjxAQDjlKDLDAUlD7=D4qjyjDDHKGremqGEDBtXD0QDATbOteDR9qDgzqD+9qDMl4G29kbxxiQ7eptDbrteDDNe9PxkiGn9Ap9Vfeu=MenrZQWXtaW8HEtlBM=Db4QaOWtEckN8x4+aKipsKAQt82DdCD4eQiqaj0+enA/bW2GAYbw=UYwo/rDxDW=DqV0D4D;ssxmod_itna2=YqUxuD0DBAitKGIx0dYYIEP7qAK4rThGubR7fx4A=a=i4AKD/BCxFO09c=FKAPqFnOu4+s=n7uv4CFj0gvPqeo7w0NYog0nRasC87P4r=VYmTrUUrTil4gAYIFmw6O/b4E/GGKWYLm0l2ohz4i/WfA03rK=R254iDU7W8Lo5qXg/wqo2TRcQA3sKAUc3EbBfkWBKnKoNBvC/nAxLffcKWRglR4U3lNGjEWYmgT+/LTh+ABe62DqDec=mg2=A+8pKA0RzQ5+s+ECz9ANdlfNHefX0EWzLT+L51QYBRLGjh/B57jY+XY+WwSKTEUuW4BNIRU0+Ll5tUIfQiEWG58W+Rq9Gmu+9GlKuRpKMnM+b90HjiaO8aLYFR3O3M7vI42bEimEiFpQsWQ5O0G9flxViYffEbbAOWbaWyTbmYgAbUANeWbSEcF+KLpb4wQORN68HtfaPa1IfAm8WbbKMEAngaHyWNkamEGevOa4i5nFPMN9855av3M1LkHNM8ykHyS86GG33WkF=ad/WxDKuYD7=DYKg+mx80exW5eD=",
"Host": "search.51job.com",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
# "Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64; x64)AppleWebKit/537.36(KHTML, like Gecko)Chrome/103.0.0.0 Safari/537.36",
"sec-ch-ua": "'.Not/A)Brand';v='99','Google Chrome';v='103','Chromium';v='103'",
}

    def run(self):
        # Entry point executed when the thread starts
        print("当前启动的线程为%s" % self.thread_name)
        # get_nowait() does not block; it raises an exception when the queue is empty
        while not page_flag:
            try:
                # Fetch a page number
                page = self.page_queue.get_nowait()
            except:
                pass
            else:
                print("当前获取到的页码%s" % page)
                page_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2," + str(page) + ".html"
                print("当前请求的URL为:%s" % page_url)
                try:
                    response = requests.get(url=page_url, headers=self.header)
                except Exception as e:
                    print('e:', e)
                    return
                response.encoding = 'utf-8'
                # print('response', response.text)
                self.data_queue.put(response.text)


class Crawl_html(threading.Thread):
    # Processes the HTML returned for each page
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("当前处理文本任务的线程为:%s" % self.thread_name)
        while not data_flag:
            try:
                text = self.data_queue.get_nowait()
            except:
                pass
            else:
                result = self.parse(text)
                data = json.loads(result).get('engine_search_result')
                # Persistence logic; a lock is used so the threads do not fight over the database resource
                # print(result)
                with self.lock:
                    HandleMongo().insert_data(data=data)

    def parse(self, text):
        # Text processing
        # Regex that matches the embedded data
        search_data = re.compile(r'window\.__SEARCH_RESULT__\s=\s(.*?)</script>')
        # Extract the data
        data = search_data.search(text)
        if data:
            job_items = data.group(1)
            return job_items


# Shared flags used to tell the worker threads when to exit
page_flag = False
data_flag = False


def main():
    # Page-number queue
    page_queue = Queue()
    # Page-data queue
    data_queue = Queue()
    # Create the lock
    lock = threading.Lock()
    # Fill the page queue with page numbers
    for page in range(1, 911):
        # put() adds an item to the queue
        page_queue.put(page)
    print("当前页码中存储的页码总量为%s" % page_queue.qsize())
    # Crawl threads
    crawlList = ["页码处理线程1号", "页码处理线程2号", "页码处理线程3号"]
    # Keeps references to the threads
    page_thread_list = []
    # Start the threads
    for thread_name_page in crawlList:
        thread_page = Crawl_page(thread_name=thread_name_page, page_queue=page_queue, data_queue=data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)
    # Text-processing threads
    parseList = ["文本处理线程1号", "文本处理线程2号", "文本处理线程3号"]
    parse_thread_list = []
    for thread_name_parse in parseList:
        thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
        thread_parse.start()
        parse_thread_list.append(thread_parse)
    # Stopping the threads
    global page_flag, data_flag
    # empty() checks whether a queue is empty
    while not page_queue.empty():
        pass
    page_flag = True
    # Wait for the crawl threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "处理结束")
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "处理结束")


if __name__ == '__main__':
    main()


# handle_pymongo.py (the module imported at the top of the script)
import pymongo


class HandleMongo(object):
    def __init__(self):
        myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_data(self, data):
        self.mycollection.insert_many(data)

1 Answer
好帮手慕小猿
2022-07-15
Hello! The site has been updated, which is why no data is retrieved. You can make the following two changes (a minimal sketch of both is shown after this answer):
1. Change response.encoding = 'utf-8' to response.encoding = 'gbk' so that the returned data is not garbled.
2. When reading the JSON, use the field 'engine_jds' instead of 'engine_search_result'.
Happy learning~
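
A minimal sketch of how the two suggested changes fit into the question's code, keeping everything else the same. The extra checks on result and data are an added safeguard rather than part of the answer above: parse() returns None when the regex finds nothing, and pymongo's insert_many() raises an error when it is given an empty or None document list.

# In Crawl_page.run(): decode the response as GBK instead of UTF-8
response = requests.get(url=page_url, headers=self.header)
response.encoding = 'gbk'
self.data_queue.put(response.text)

# In Crawl_html.run(): read the renamed JSON field and skip pages that could not be parsed
result = self.parse(text)
if result:                                        # parse() returns None if the regex does not match
    data = json.loads(result).get('engine_jds')   # field was renamed from 'engine_search_result'
    if data:                                      # insert_many() fails on an empty or None document list
        with self.lock:
            HandleMongo().insert_data(data=data)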