On a whim I wrote a small crawler as an exercise. It scrapes the 第一PPT (1ppt.com) site and, for a chosen category, automatically downloads every PPT archive in that category.
Libraries used in this crawler:
os: standard library, used to create directories and check whether a file exists
pathlib: a friendlier path-handling library than os.path
tqdm: progress-bar library, for conveniently displaying download progress
requests: HTTP client library
pyquery: jQuery-like HTML parsing library (a short sketch follows this list)
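To get a feel for pyquery before the full script, here is a minimal sketch; the HTML fragment and selector below are made up purely for illustration:

    from pyquery import PyQuery as pq

    # Parse an HTML fragment; pq() can also fetch a URL directly, as the script does.
    doc = pq('<ul class="tplist"><li><a href="/a.html">Demo</a></li></ul>')
    for a in doc('li > a').items():      # CSS selectors, just like jQuery
        print(a.text(), a.attr('href'))  # -> Demo /a.html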
Here is the full code:
import os
import requests
from pyquery import PyQuery as pq
from pathlib import Path
from tqdm import tqdm
baseUrl = 'http://www.1ppt.com'
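# Resumable download: if a partial file already exists on disk, request only
# the remaining bytes via an HTTP Range header and append them to the file.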
def down_from_url(url, dst):
    # Stream the response so the file is not buffered in memory.
    response = requests.get(url, stream=True)
    file_size = int(response.headers['content-length'])
    # If a partial file is already on disk, resume from its current size.
    if os.path.exists(dst):
        first_byte = os.path.getsize(dst)
    else:
        first_byte = 0
    if first_byte >= file_size:
        print('File already exists: ' + dst.name)
        return file_size
    # Ask the server for the missing tail only (open-ended byte range).
    header = {"Range": f"bytes={first_byte}-"}
    pbar = tqdm(total=file_size, initial=first_byte, unit='B', unit_scale=True, desc=dst.name)
    req = requests.get(url, headers=header, stream=True)
    with open(dst, 'ab') as f:  # append mode, so resumed bytes extend the file
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                pbar.update(len(chunk))  # the last chunk may be shorter than 1024
    pbar.close()
    return file_size
def downFile(pageUrl):
    url = baseUrl + pageUrl
    print('Processing: ' + url)
    doc = pq(url, encoding='gb2312')  # the site is encoded in gb2312
    downUrl = doc('ul.downurllist a').attr('href')
    print('Download URL: ' + downUrl)
    savePath = 'D:\\ppt\\'
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    fileName = Path(savePath) / downUrl.split('/')[-1]  # file name = last URL segment
    down_from_url(downUrl, fileName)
def getArticle(doc):
    # Each list item on a category page links to one PPT's detail page.
    for a in doc('ul.tplist > li > a').items():
        downFile(a.attr('href'))
def getPagesHtml(pageUrls):
    for obj in pageUrls:
        if obj < 2:
            # Page 1 was already fetched and stored as a parsed document.
            print('Processing page ' + str(obj))
            doc = pageUrls[obj]
        else:
            print('Processing page ' + str(obj) + ': ' + pageUrls[obj])
            doc = pq(baseUrl + pageUrls[obj], encoding='gb2312')
        getArticle(doc)
doc = pq(baseUrl + '/hangye/', encoding='gb2312')
lis = {}
for i, li in enumerate(doc('.col_nav li').items()):
    if i < 1:
        continue  # skip the first item, which is the column heading, not a category
    lis[i] = {
        'name': li.find('a').text(),
        'url': li.find('a').attr('href')
    }
    print(str(i) + ': ' + li.text())
# Keep prompting until the user enters a number that exists in the menu.
xuhaoOk = True
xuhao = ''
while xuhaoOk:
    xuhao = input('Enter a category number: ')
    if not xuhao.isdigit():
        print('Please enter a valid number')
        continue
    if int(xuhao) not in lis:
        print('Please enter a number from the list')
        continue
    xuhaoOk = False

print('Downloading category: ' + lis[int(xuhao)]['name'])
doc = pq(baseUrl + lis[int(xuhao)]['url'], encoding='gb2312')
# The pager list also contains prev/next/total entries, hence the -3.
print('This category has ' + str(doc('ul.pages li').length - 3) + ' pages')
# Page 1 is the already-parsed document; later pages are stored as URLs.
pageUrls = {1: doc}
for a in doc('ul.pages li a').items():
    if not a.text().isdigit():
        continue  # skip "next"/"last" style pager links
    pageUrls[int(a.text())] = lis[int(xuhao)]['url'] + a.attr('href')
getPagesHtml(pageUrls)
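One caveat: resuming via the Range header only works if the server honors byte-range requests. If you want to check first, a possible pre-check (a sketch, not part of the original script; supports_resume is a hypothetical helper) is a HEAD request that inspects the Accept-Ranges header, which also avoids opening a full GET just to read the file size:

    import requests

    def supports_resume(url):
        # Servers that honor byte-range requests usually advertise it.
        head = requests.head(url, allow_redirects=True)
        return head.headers.get('Accept-Ranges') == 'bytes'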