老师给看一下这个问题:百度说要转换格式? 可是之前好像没有吧

来源:6-2 实战—selenium实现51job全站点岗位信息自动化抓取-2

覀丶贝

2020-07-10 18:47:35

#这是主代码

from selenium import webdriver
from selenium.webdriver.common.by import By
#等待用包(主要)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
from handle_selenium_MongoDB import insert_data

class Handle_webdriver(object):
   def __init__(self):
       #Chrome  C是大写
       self.driver = webdriver.Chrome("D:\Web-Driver\chromedriver.exe")
       # 设置最大窗口
       self.driver.maximize_window()

   def handle_job(self):
       # 获得请求网页
       self.driver.get("https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html")
       #通过WebDriverWait进行等待,等待检查这个搜索框是否存在
       if WebDriverWait(self.driver,5,0.5).until(EC.presence_of_element_located((By.ID,'kwdselectid'))):
           # 从外部获取输入的岗位信息
           input_keyword = input('请输入要查找的岗位信息:')
           # 通过 find_element_by_id 找到搜索框控件
           # 通过 send_keys 发送要查找的信息岗位
           self.driver.find_element_by_id('kwdselectid').send_keys(input_keyword)
           # 点击搜索 (通过class_name)
           self.driver.find_element_by_class_name('p_but').click()
           # time.sleep(5)
           # self.driver.quit()
           # 这个是职位框框
           if WebDriverWait(self.driver, 5, 0.5).until(EC.presence_of_element_located((By.ID, 'resultList'))):
               # 查看网页源代码
               # print(self.driver.page_source)
               self.handle_parse(self.driver.page_source)


   def handle_parse(self,page_source):
       html_51job = etree.HTML(page_source)
       # 单引号双引号要注意
       all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
       info_list = []
       for item in all_div:
           info = {}
           # 这个.非常的重要,代表我们使用的是item下的xpath语句,不要把.丢了
           # 获取数据的时候,要使用列表索引为0的数据
           info['job_name'] = item.xpath("./p/span/a/@title")[0]
           info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
           # 把下面这三个字段补齐
           info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
           # money字段可能为空,try,except来进行异常处理
           try:
               info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
           except IndexError:
               info['money'] = '无数据'
           info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
           # print(info)
           info_list.append(info)
       # 想看结果,可以在控制台打印输出
       # for i in info_list:
       #     print(i)

       #将数据转移到 mongoDB 数据库中
       insert_data.insert_db(info)

test_selenium = Handle_webdriver()
test_selenium.handle_job()

#这是数据库代码

import pymongo

class Mongo_Client(object):
   def __init__(self):
       myclient = pymongo.MongoClient("mongodb://192.168.0.105:27017")
       mydb = myclient['db_selenium_51job']
       self.mycollection = mydb['colc_selenium_51job']

   def insert_db(self,item):
       self.mycollection.insert_many(item)

insert_data = Mongo_Client()


#这是报错信息:

请输入要查找的岗位信息:java
Traceback (most recent call last):
  File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 67, in <module>
    test_selenium.handle_job()
  File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 35, in handle_job
    self.handle_parse(self.driver.page_source)
  File "D:/Spyder_Exer/Selenium_51Job/handle_selenium_51job.py", line 64, in handle_parse
    insert_data.insert_db(info)
  File "D:\Spyder_Exer\Selenium_51Job\handle_selenium_MongoDB.py", line 10, in insert_db
    self.mycollection.insert_many(item)
  File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in insert_many
    blk.ops = [doc for doc in gen()]
  File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 757, in <listcomp>
    blk.ops = [doc for doc in gen()]
  File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\collection.py", line 748, in gen
    common.validate_is_document_type("document", document)
  File "D:\Spyder_Exer\Selenium_51Job\lib\site-packages\pymongo\common.py", line 483, in validate_is_document_type
    "collections.MutableMapping" % (option,))
TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping

Process finished with exit code 1


写回答

1回答

时间,

2020-07-10

同学,你好。在插入数据时,传入的参数应该是info_list

http://img.mukewang.com/climg/5f085077092273b810580283.jpg

http://img.mukewang.com/climg/5f08510b090ced3809670240.jpg

如果我的回答解决了您的疑惑,请采纳!祝学习愉快~~~~

0

0 学习 · 1672 问题

查看课程