import csv
import parser  # NOTE(review): unused, and the stdlib 'parser' module was removed in Python 3.10 — kept only to preserve existing imports

import requests
from lxml import etree
def csv_writer(item):
    """Append one row to qianduan4.csv and print crawl progress.

    Args:
        item: sequence of cell values for one article;
              item[0] is expected to be the article title.
    """
    # Explicit utf-8: the platform default encoding (e.g. GBK on Windows)
    # cannot reliably round-trip the Chinese titles/content being saved.
    with open('qianduan4.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        try:
            writer.writerow(item)
        except Exception as e:
            # Best-effort: log the failed row but keep the crawl running.
            print('保存错误', e)
    print('正在爬取:', item[0])
def spider(url_):
    """Fetch url_ (using the module-level headers) and return the parsed lxml HTML tree."""
    page = requests.get(url_, headers=headers)
    return etree.HTML(page.text)
def parse(list_url):
    """Scrape one listing page: extract each article's title, date and body, and save them."""
    tree = spider(list_url)
    # Each article teaser on the listing page has class "excerpt excerpt-N".
    for post in tree.xpath('//*[starts-with(@class,"excerpt excerpt-")]'):
        link = post.xpath('header/h2/a/@href')[0]
        csv_writer([
            post.xpath('header/h2/a/text()')[0],
            post.xpath('p[1]/time/text()')[0],
            parse_detail(link),
        ])
def parse_detail(detail_url):
    """Fetch an article page and return its body text as a single string."""
    tree = spider(detail_url)
    return tree.xpath('string(//*[@class="article-content"])')
# Listing pages to crawl: pages 1 and 2 of the article index.
pre_url = 'http://www.daqianduan.com/page/'
all_url = [pre_url + str(x) for x in range(1, 3)]
# Browser-like User-Agent so the site does not reject the requests.
# Kept at module level because spider() reads it as a global.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}

if __name__ == '__main__':
    # Guarded so importing this module does not trigger a crawl.
    for url in all_url:
        parse(url)
# NOTE(review): removed stray text "相关" ("related") — a web-page footer artifact
# pasted into the file; it was not code and made the module unparseable.