老师给看一下这个问题：百度说要转换格式？可是之前好像没有吧

来源：6-2 实战—selenium实现51job全站点岗位信息自动化抓取-2

覀丶贝

2020-07-10 18:47:35

#这是主代码

from selenium import webdriver
from selenium.webdriver.common.by import By
#等待用包（主要）
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
from handle_selenium_MongoDB import insert_data

class Handle_webdriver(object):
def __init__(self):
#Chrome C是大写
self.driver = webdriver.Chrome("D:\Web-Driver\chromedriver.exe")
# 设置最大窗口
self.driver.maximize_window()

def handle_job(self):
# 获得请求网页
self.driver.get("https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html")
#通过WebDriverWait进行等待，等待检查这个搜索框是否存在
if WebDriverWait(self.driver,5,0.5).until(EC.presence_of_element_located((By.ID,'kwdselectid'))):
# 从外部获取输入的岗位信息
input_keyword = input('请输入要查找的岗位信息：')
# 通过 find_element_by_id 找到搜索框控件
# 通过 send_keys 发送要查找的信息岗位
self.driver.find_element_by_id('kwdselectid').send_keys(input_keyword)
# 点击搜索（通过class_name）
self.driver.find_element_by_class_name('p_but').click()
# time.sleep(5)
# self.driver.quit()
# 这个是职位框框
if WebDriverWait(self.driver, 5, 0.5).until(EC.presence_of_element_located((By.ID, 'resultList'))):
# 查看网页源代码
# print(self.driver.page_source)
self.handle_parse(self.driver.page_source)

def handle_parse(self,page_source):
html_51job = etree.HTML(page_source)
# 单引号双引号要注意
all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
info_list = []
for item in all_div:
info = {}
# 这个.非常的重要，代表我们使用的是item下的xpath语句,不要把.丢了
# 获取数据的时候，要使用列表索引为0的数据
info['job_name'] = item.xpath("./p/span/a/@title")[0]
info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
# 把下面这三个字段补齐
info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
# money字段可能为空，try,except来进行异常处理
try:
info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
except IndexError:
info['money'] = '无数据'
info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
# print(info)
info_list.append(info)
# 想看结果，可以在控制台打印输出
# for i in info_list:
# print(i)

#将数据转移到 mongoDB 数据库中
insert_data.insert_db(info)

test_selenium = Handle_webdriver()
test_selenium.handle_job()

#这是数据库代码

import pymongo

class Mongo_Client(object):
def __init__(self):
myclient = pymongo.MongoClient("mongodb://192.168.0.105:27017")
mydb = myclient['db_selenium_51job']
self.mycollection = mydb['colc_selenium_51job']

def insert_db(self,item):
self.mycollection.insert_many(item)

insert_data = Mongo_Client()

#这是报错信息：

请输入要查找的岗位信息：java
Traceback (most recent call last):
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 67, in <module>
    test_selenium.handle_job()
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 35, in handle_job
    self.handle_parse(self.driver.page_source)
File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 64, in handle_parse
    insert_data.insert_db(info)
File "D:\Spyder_Exer\Selenium_51Job\handle_selenium_MongoDB.py", line 10, in insert_db
    self.mycollection.insert_many(item)
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in insert_many
    blk.ops = [doc for doc in gen()]
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in <listcomp>
    blk.ops = [doc for doc in gen()]
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 748, in gen
    common.validate_is_document_type("document", document)
File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\common.py", line 483, in validate_is_document_type
    "collections.MutableMapping" % (option,))
TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping