关于爬虫编码的问题
来源:1-1 本周介绍
__________千
2021-12-29 18:09:33
#coding:utf-8 import random import time import requests from lxml import etree def get_data(url,headers): response = requests.get(url = url,headers = headers) response.encoding = 'utf-8' if response.status_code == 200: html = etree.HTML(response.text) all_li = html.xpath('//ul[@class=\'tj3_1\']/li') for item in all_li: new = item.xpath('./a/text()')[0] new_url = 'https://news.youth.cn/jsxw/'+item.xpath('./a/@href')[0].strip('.') date = item.xpath('./font/text()')[0] print('新闻标题为{}\n新闻url地址为{}\n新闻发布时间为{}'.format(new,new_url,date)) def main(): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34' } for i in range(10): if i == 0: url = 'https://news.youth.cn/jsxw/index.htm' get_data(url,headers) else: url = 'https://news.youth.cn/jsxw/index_{}.htm'.format(i) get_data(url,headers) time.sleep(random.randint(2,4)) if __name__ == '__main__': main()
老师我的新闻标题为什么是乱码
1回答
时间,
2021-12-29
同学,你好!同学设置的编码格式不对,应该是'gb2312'
祝学习愉快!
相似问题