老师给看一下这个问题:百度说要转换格式? 可是之前好像没有吧
来源:6-2 实战—selenium实现51job全站点岗位信息自动化抓取-2
覀丶贝
2020-07-10 18:47:35
#这是主代码
from selenium import webdriver
from selenium.webdriver.common.by import By
#等待用包(主要)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
from handle_selenium_MongoDB import insert_data
class Handle_webdriver(object):
def __init__(self):
#Chrome C是大写
self.driver = webdriver.Chrome("D:\Web-Driver\chromedriver.exe")
# 设置最大窗口
self.driver.maximize_window()
def handle_job(self):
# 获得请求网页
self.driver.get("https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html")
#通过WebDriverWait进行等待,等待检查这个搜索框是否存在
if WebDriverWait(self.driver,5,0.5).until(EC.presence_of_element_located((By.ID,'kwdselectid'))):
# 从外部获取输入的岗位信息
input_keyword = input('请输入要查找的岗位信息:')
# 通过 find_element_by_id 找到搜索框控件
# 通过 send_keys 发送要查找的信息岗位
self.driver.find_element_by_id('kwdselectid').send_keys(input_keyword)
# 点击搜索 (通过class_name)
self.driver.find_element_by_class_name('p_but').click()
# time.sleep(5)
# self.driver.quit()
# 这个是职位框框
if WebDriverWait(self.driver, 5, 0.5).until(EC.presence_of_element_located((By.ID, 'resultList'))):
# 查看网页源代码
# print(self.driver.page_source)
self.handle_parse(self.driver.page_source)
def handle_parse(self,page_source):
html_51job = etree.HTML(page_source)
# 单引号双引号要注意
all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
info_list = []
for item in all_div:
info = {}
# 这个.非常的重要,代表我们使用的是item下的xpath语句,不要把.丢了
# 获取数据的时候,要使用列表索引为0的数据
info['job_name'] = item.xpath("./p/span/a/@title")[0]
info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
# 把下面这三个字段补齐
info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
# money字段可能为空,try,except来进行异常处理
try:
info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
except IndexError:
info['money'] = '无数据'
info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
# print(info)
info_list.append(info)
# 想看结果,可以在控制台打印输出
# for i in info_list:
# print(i)
#将数据转移到 mongoDB 数据库中
insert_data.insert_db(info)
test_selenium = Handle_webdriver()
test_selenium.handle_job()
#这是数据库代码
import pymongo
class Mongo_Client(object):
def __init__(self):
myclient = pymongo.MongoClient("mongodb://192.168.0.105:27017")
mydb = myclient['db_selenium_51job']
self.mycollection = mydb['colc_selenium_51job']
def insert_db(self,item):
self.mycollection.insert_many(item)
insert_data = Mongo_Client()
#这是报错信息:
请输入要查找的岗位信息:java
Traceback (most recent call last):
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 67, in <module>
test_selenium.handle_job()
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 35, in handle_job
self.handle_parse(self.driver.page_source)
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 64, in handle_parse
insert_data.insert_db(info)
File "D:\Spyder_Exer\Selenium_51Job\handle_selenium_MongoDB.py", line 10, in insert_db
self.mycollection.insert_many(item)
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in insert_many
blk.ops = [doc for doc in gen()]
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in <listcomp>
blk.ops = [doc for doc in gen()]
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 748, in gen
common.validate_is_document_type("document", document)
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\common.py", line 483, in validate_is_document_type
"collections.MutableMapping" % (option,))
TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Process finished with exit code 1
1回答
时间,
2020-07-10
同学,你好。在插入数据时,传入的参数应该是info_list


如果我的回答解决了您的疑惑,请采纳!祝学习愉快~~~~
相似问题