import csv

import requests
from lxml import etree


def csv_writer(item):
    # Append one article row to the CSV; utf-8 keeps the Chinese text intact on any platform.
    with open('qianduan4.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        try:
            writer.writerow(item)
        except Exception as e:
            print('Save error:', e)
        print('Crawling:', item[0])


def spider(url_):
    # Fetch a page and return it as a parsed lxml element tree.
    response = requests.get(url_, headers=headers)
    return etree.HTML(response.text)


def parse(list_url):
    # Parse one listing page: extract the title, publish time and detail URL of each article,
    # fetch the article body, then write everything out as one CSV row.
    selector = spider(list_url)
    all_article = selector.xpath('//*[starts-with(@class,"excerpt excerpt-")]')
    for article in all_article:
        title = article.xpath('header/h2/a/text()')[0]
        time = article.xpath('p[1]/time/text()')[0]
        detail_url = article.xpath('header/h2/a/@href')[0]
        content = parse_detail(detail_url)
        csv_writer([title, time, content])


def parse_detail(detail_url):
    # Return the full text of the article body on the detail page.
    sel = spider(detail_url)
    return sel.xpath('string(//*[@class="article-content"])')


# Crawl listing pages 1 and 2. `headers` is looked up at call time inside spider(),
# so defining it here, after the function definitions, still works.
pre_url = 'http://www.daqianduan.com/page/'
all_url = [pre_url + str(x) for x in range(1, 3)]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}

for url in all_url:
    parse(url)
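
To sanity-check what was scraped, the rows can be read straight back with the csv module. This is only a quick sketch that assumes the qianduan4.csv file written above; it prints the title column of each saved article.

with open('qianduan4.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        # Each row is [title, time, content]; print the title as a spot check.
        print(row[0])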
