关于爬取数据突然出现问题
来源:6-2 实战—selenium实现51job全站点岗位信息自动化抓取-2
可可丶泗水
2020-05-14 14:19:43
class handle_webDriver(object):
    """Crawl 51job search results with a headless Chrome driver.

    Workflow: open the search page, type the user-supplied keyword,
    walk every result page, and persist each job row through the
    module-level ``selenium_mongo`` helper.
    """

    def __init__(self):
        # Configure a headless browser.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # BUG FIX: the options object was created but never passed to the
        # driver, so Chrome was actually launched with a visible window.
        self.driver = webdriver.Chrome(executable_path='chromedriver.exe',
                                       options=chrome_options)
        self.driver.maximize_window()

    def handle_job(self):
        """Submit the search keyword and iterate over every result page."""
        # BUG FIX: the pasted URL contained '°reefrom=99' — a mangled HTML
        # entity (&deg;) — restored to the real query parameter '&degreefrom=99'.
        self.driver.get(
            'https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
        # Explicit wait: make sure the keyword input box is present.
        if WebDriverWait(self.driver, 5, 0.5).until(EC.presence_of_all_elements_located((By.ID, 'kwdselectid'))):
            input_keys = input("请输入要查看的职位名称:")
            self.driver.find_element_by_id('kwdselectid').send_keys(input_keys)
            self.driver.find_element_by_xpath('//button[@class="p_but"]').click()
            # Explicit wait: the result list must be rendered before parsing.
            if WebDriverWait(self.driver, 5, 0.5).until(EC.presence_of_all_elements_located((By.ID, 'resultList'))):
                while True:
                    self.handle_parse(self.driver.page_source)
                    # BUG FIX: the original bare `except: break` wrapped the
                    # whole body, hiding parse errors; and when the link
                    # existed but its text was not '下一页' the loop re-parsed
                    # the same page forever. Now only the element lookup is
                    # guarded, and any non-next-page state ends the loop.
                    try:
                        next_link = self.driver.find_element_by_xpath('//li[@class="bk"][2]/a')
                    except Exception:
                        # No pagination link -> last page reached.
                        break
                    if next_link.text == '下一页':
                        next_link.click()
                    else:
                        break
        self.driver.quit()

    def handle_parse(self, page_source):
        """Parse one result page's HTML and store every job row.

        :param page_source: raw HTML string of the current result page.
        """
        # Wrap the source in an lxml tree for XPath queries.
        html_51job = etree.HTML(page_source)
        # One <div class="el"> per job row inside the result list.
        all_div = html_51job.xpath('//div[@id="resultList"]//div[@class="el"]')
        for item in all_div:
            info = {
                'job_name': item.xpath('.//p/span/a/@title'),
                'company_name': item.xpath('.//span[@class="t2"]/a/@title'),
                'company_add': item.xpath('.//span[@class="t3"]/text()'),
                'date': item.xpath('.//span[@class="t5"]/text()')
            }
            # BUG FIX: the salary XPath was absolute ('//span[@class="t4"]'),
            # which matched every salary on the whole page instead of this
            # row's. Also, xpath() never raises for a missing node — it
            # returns an empty list — so the old try/except could never
            # trigger the '面议' fallback. Test the list explicitly instead.
            money = item.xpath('.//span[@class="t4"]/text()')
            info['money'] = money if money else '面议'
            print(info)
            selenium_mongo.insert_db(info)
if __name__ == '__main__':
    # Entry point: build the driver wrapper and kick off the crawl.
    scraper = handle_webDriver()
    scraper.handle_job()
import pymongo
class Selenium_Mongo(object):
    """Thin wrapper around the MongoDB collection that stores 51job rows."""

    def __init__(self):
        # NOTE: connects at import time because a module-level instance is
        # created below.
        my_client = pymongo.MongoClient('mongodb://localhost:27017')
        my_db = my_client['selenium_51job']
        # Target collection used by insert_db().
        self.mycollection = my_db['collection_selenium_51job']

    def insert_db(self, items):
        """Persist one job record.

        :param items: a single dict describing one job row.

        BUG FIX: callers pass one dict per job, but ``insert_many`` requires
        an iterable of documents and rejects a plain dict — so no data ever
        reached the collection. ``insert_one`` is the correct call here.
        """
        self.mycollection.insert_one(items)


# Shared module-level instance used by handle_webDriver.handle_parse.
selenium_mongo = Selenium_Mongo()
老师,我这个代码应该没有问题吧,为啥爬取不到数据了,是不是反爬机制造成的
运行结果如下:
1回答
好帮手慕笑蓉
2020-05-14
同学,你好。老师这边测试了同学的代码,是没有问题的,数据可以正常爬取,同学可以尝试换个职位试一试。
如果解决了你的疑惑,请采纳,祝学习愉快~
相似问题