|
禁止求评分、诱导评分、互刷评分、互刷悬赏值,违规者封号处理。
禁止发布推广、邀请码、邀请链接、二维码或者有利益相关的任何推广行为。
所有非原创软件请发布在【精品软件区】,发帖必须按照本版块版规格式发帖。
本帖最后由 Jacklin 于 2020-10-27 14:34 编辑
ppt模板地址:http://www.ypppt.com/moban/ 这是一个免费的ppt模板网站
前两天帮朋友下载述职报告模板,需要啥分类自己改下地址就可以用了。
代码写的渣勿喷。
附上代码:- import time
- import requests
- from scrapy.selector import Selector
-
def RequestsDX(url):
    """GET *url* with a browser-like User-Agent and return the Response.

    The response encoding is forced to UTF-8 because the target site
    (www.ypppt.com) serves UTF-8 pages.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/'
                  'signed-exchange;v=b3;q=0.9'
    }
    # BUG FIX: the second positional argument of requests.get() is
    # ``params``, not ``headers`` — the original call sent the header dict
    # as query-string parameters and the request went out with default
    # headers.  Pass it by keyword.  A timeout is added so a stalled
    # server cannot hang the scraper forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    return response
-
def SelectorDX(response):
    """Wrap the response's HTML text in a scrapy Selector for XPath queries."""
    return Selector(text=response.text)
-
def url_page(category='shuzhi', max_pages=1000):
    """Return the listing-page URLs for one template *category*.

    Probes list-2.html, list-3.html, ... in order and stops at the first
    page that does not answer HTTP 200, so only pages that actually exist
    are collected.

    :param category: path segment of the category on ypppt.com
                     (default ``'shuzhi'``, the original hard-coded value —
                     parameterized so other categories no longer require
                     editing the URL by hand).
    :param max_pages: safety cap on how many pages to probe.
    :return: list of listing-page URLs, first entry is the category index.
    """
    url_list = ['http://www.ypppt.com/moban/%s/' % category]
    for page in range(2, max_pages):
        page_url = 'http://www.ypppt.com/moban/%s/list-%s.html' % (category, page)
        if RequestsDX(page_url).status_code == 200:
            url_list.append(page_url)
        else:
            # first missing page: everything after it is missing too
            break
    return url_list
-
def url_page_ppt_id(selector):
    """Extract the numeric template ids from one listing page.

    Each template is an ``<li>`` whose ``<a>`` href ends in ``<id>.html``;
    the id is the substring between the last ``/`` and ``.html`` of the
    opening ``<a ...>`` tag.

    :param selector: scrapy Selector over the listing page HTML.
    :return: list of id strings, one per template on the page.
    """
    ids = []
    li_count = len(selector.xpath('/html/body/div[2]/ul/li'))
    # BUG FIX: XPath positions are 1-indexed, so the valid range is
    # li[1] .. li[li_count] inclusive.  The original range(1, li_count)
    # silently dropped the last template on every page.
    for i in range(1, li_count + 1):
        tag = selector.xpath('/html/body/div[2]/ul/li[%s]/a' % i).extract_first()
        # crude but effective string carving: take the opening <a ...> tag
        # and slice out what sits between the last '/' and '.html'
        anchor = tag.split('<')[1]
        end = anchor.find('.html')
        start = anchor.rfind('/')
        ids.append(anchor[start + 1:end])
    return ids
-
def download_page():
    """Build the download-page URL for every template in the category.

    Walks every listing page, pulls the template ids off each one, and
    joins them onto the site's download endpoint.
    """
    base = 'http://www.ypppt.com/p/d.php?aid='
    url_download = []
    for page_url in url_page():
        page_selector = SelectorDX(RequestsDX(page_url))
        for ppt_id in url_page_ppt_id(page_selector):
            url_download.append(base + ppt_id)
    return url_download
-
def download_url():
    """Resolve every download page into a display name and a direct link.

    For each download page, the first mirror link's href (the text between
    the quotes of the raw ``<a ...>`` markup) becomes the download URL, and
    the ``<h1>`` title — stripped of its tags and the ``' - 下载页'``
    suffix — becomes the file name.

    :return: two-element list ``[name_list, url_list]`` in matching order.
    """
    link_xpath = '/html/body/div/div/ul/li[1]/a'
    filename_xpath = '/html/body/div/div/div[2]/div[2]/h1'
    name_list = []
    url_list = []
    for page in download_page():
        sel = SelectorDX(RequestsDX(page))
        raw_link = sel.xpath(link_xpath).extract_first()
        raw_title = sel.xpath(filename_xpath).extract_first()
        name = raw_title.replace(' - 下载页', '').replace('<h1>', '').replace('</h1>', '')
        first_quote = raw_link.find('"')
        last_quote = raw_link.rfind('"')
        url_list.append(raw_link[first_quote + 1:last_quote])
        name_list.append(name)
    return [name_list, url_list]
-
def download():
    """Download every template zip into D:\\ppt and print the elapsed time.

    Each file is written in 1 KiB chunks under the template's display name.
    NOTE(review): filenames come straight from the page title — characters
    illegal on Windows (e.g. ``?``, ``:``) would make open() fail; confirm
    titles are clean before relying on this.
    """
    import os  # function-scope: only needed here, keeps the script layout intact

    start_time = time.time()
    download_list = download_url()
    names, urls = download_list[0], download_list[1]
    # ROBUSTNESS FIX: the original crashed with FileNotFoundError when the
    # target directory did not exist — create it up front.
    os.makedirs(r'D:\ppt', exist_ok=True)
    for name, file_url in zip(names, urls):
        response = RequestsDX(file_url)
        print('=' * 100)
        print('正在下载', name)
        with open(r'D:\ppt\%s.zip' % name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        print('下载完成')
    end_time = time.time()
    cost = end_time - start_time
    print(cost)
# Guard the entry point so importing this module does not start scraping.
if __name__ == '__main__':
    download()
最后贴上效果图:
最后有几个疑问希望能有人帮我解答下:
1、我觉着是可以下载的更快的,可能是写法问题,请大佬指教
2、能否用多线程处理多个任务这样应该能更快,但是不太熟悉多线程模块,希望能有大佬提供个思路
3、能否告知python的写法的标准,我感觉我写的变量稀碎,因为是自学的所以全部都是自由发挥 |
|