import requests
from lxml import etree

# Minimal sanity check: lxml wraps a bare string in an <html> tree, so
# //text() returns whatever text nodes it finds (here just ['htm']).
selector = etree.HTML('htm')
li_1 = selector.xpath('//text()')
print(li_1)
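
# A slightly richer sketch on an assumed inline snippet, showing how XPath
# picks out specific nodes; kept commented out like the examples below.
'''
sample_html = '<div><h2><a href="/post/1">Hello</a></h2><p class="meta">2019-04-01</p></div>'
sample = etree.HTML(sample_html)
print(sample.xpath('//h2/a/text()'))              # ['Hello']
print(sample.xpath('//p[@class="meta"]/text()'))  # ['2019-04-01']
'''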
'''
response = requests.get('https://ledovape.com')

selector = etree.HTML(response.text)
title_1 = selector.xpath('//text()')
print(title_1)
'''
'''
response = requests.get('http://www.douban.com')   # GET request
response = requests.post('http://www.douban.com')  # POST request (overwrites the GET response above)
selector = etree.HTML(response.text)

print(response.text)      # print the page source
print(response.encoding)  # encoding guessed by requests, e.g. 'utf-8'

res = requests.get('http://www.baidu.com')  # request the site
res.encoding = 'utf-8'                      # override the encoding used to decode res.text
print(res.encoding)
print(res.text)
print(res.content)  # raw binary content (bytes)
'''
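
# A variant sketch: instead of hard-coding 'utf-8', let requests sniff the
# charset from the page body itself via Response.apparent_encoding.
'''
res = requests.get('http://www.baidu.com')
res.encoding = res.apparent_encoding  # detect the encoding from the content
print(res.encoding)
print(res.text[:200])
'''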
'''
image_url = 'https://www.baidu.com/img/superlogo_c4d7df0a003d3db9b65e9ef0fe6da1ec.png?where=super'
response = requests.get(image_url)
with open('baidu-logo.png', 'wb') as f:
    f.write(response.content)  # save the downloaded image to disk
'''
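
# A streaming sketch for larger files (reusing image_url from the block above):
# stream=True avoids holding the whole body in memory, and iter_content()
# writes it out chunk by chunk.
'''
response = requests.get(image_url, stream=True)
with open('baidu-logo.png', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
'''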
'''
# Add a headers dict so the request looks like a normal browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response = requests.get('http://www.baidu.com', headers=headers)
print(response.request.headers)
print(response.history)

r = requests.get('http://www.douban.com', timeout=3)
print(r.url)      # final URL after any redirects
print(r.history)  # redirect history

r = requests.get('http://www.douban.com/search', params={'q': 'python', 'cat': '1001'})  # pass URL query parameters
print(r.url)

from urllib.request import urlopen
html = urlopen('http://www.douban.com')
response = html.read()
print(response.decode('utf-8'))  # decode the raw page bytes as utf-8
'''
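
# The same idea sketched with requests.Session: default headers are set once
# and reused on every request, and cookies persist across calls.
'''
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'})
r = session.get('http://www.douban.com', timeout=3)
print(r.status_code)
'''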
# Crawl the article index of daqianduan.com and save titles and dates to CSV.
import csv  # requests and etree are already imported at the top

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
pre_url = 'http://www.daqianduan.com/page/'
all_url = [pre_url + str(x) for x in range(1, 73)]  # listing pages 1-72

for url in all_url:
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    all_article = selector.xpath('//*[starts-with(@class,"excerpt excerpt-")]')
    for article in all_article:
        title = article.xpath('header/h2/a/text()')[0]
        time = article.xpath('p[1]/time/text()')[0]
        item = [title, time]
        with open('qianduan.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)
            print('Crawling', title)
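
# A possible refinement sketch: open the CSV once with a header row instead of
# reopening it per article, and pause between pages to be gentle on the site.
'''
import time as pause  # renamed so it does not clash with the `time` variable above

with open('qianduan.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['title', 'time'])
    for url in all_url:
        response = requests.get(url, headers=headers, timeout=10)
        selector = etree.HTML(response.text)
        for article in selector.xpath('//*[starts-with(@class,"excerpt excerpt-")]'):
            writer.writerow([article.xpath('header/h2/a/text()')[0],
                             article.xpath('p[1]/time/text()')[0]])
        pause.sleep(1)  # one-second pause between pages
'''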
