添加 main.py
This commit is contained in:
100
main.py
Normal file
100
main.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding:utf-8 -*-
|
||||||
|
# author:xjl
|
||||||
|
# datetime:2019/12/30 19:39
|
||||||
|
# software: PyCharm
"""Hand-written script that downloads the documents behind a target URL.

1. Locate the target URL.
2. Create a local folder.
3. Save the downloaded pages into that folder.
"""
|
||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
from urllib.request import urlretrieve
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Entry page that is fetched first; its catalog list is parsed for links.
url = 'https://www.kancloud.cn/niucloud/niushop_b2c_v5/3037617'
# Base URL that every catalog link (href) is resolved against.
nextUrl = 'https://www.kancloud.cn/niucloud/niushop_b2c_v5/'

# Request headers, copied from a desktop Chrome session.
# NOTE(review): the default Cookie below is a captured browser session and
# will eventually expire. Set the KANCLOUD_COOKIE environment variable to
# supply a fresh one without editing (or committing) this file.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # `or` (not a .get default) so that an empty variable also falls back.
    'Cookie': os.environ.get('KANCLOUD_COOKIE') or 'fpv=a6446d0e9f72400f153461652c0c3b92; PHPSESSID=tkue2dl2n1e9t4pcc2ipd84g6e; bc_bot_session=173338642649f18cac8b744232; yd_captcha_token=MTczMzM4NjQyODc4M18xMTkuMTYyLjIwMy4yNDJfYWJiM2YzMDc4ZDExNDIyNGQwYTkwNDJkMmYyYzI0ZjQ4Yw%3D%3D; bc_bot_token=100173338642649f18cac8b744232a2ac68; bc_bot_rules=-; bc_bot_fp=9095436cc1d5247f67bfab1401ac4673; _aihecong_chat_visibility=false; waf_captcha_marker=0754271cde63977f720eee726a56820226aaa222eb891811e5eb25a966ef7a4a',
    'Pragma': 'no-cache',
    'Referer': 'https://www.kancloud.cn/niucloud/niushop_b2c_v5/3037616',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
}
|
||||||
|
|
||||||
|
# Fetch the entry page. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() stops early on an HTTP error
# instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Print the response body (use response.content for binary payloads).
print(response.text)

soup = BeautifulSoup(response.text, 'html.parser')

# The links to download are the <li> items inside <div class="catalog">.
catalog_div = soup.find('div', class_='catalog')
if catalog_div is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on None.
    raise RuntimeError(
        f"No <div class='catalog'> found in {url}; the request may have "
        "been blocked or the cookie may have expired."
    )
li_list = catalog_div.find_all('li')
print(li_list)
|
||||||
|
|
||||||
|
# Earlier single-page prototype, kept for reference.
# Find the div element whose class is 'content'.
# content_div = soup.find('div', class_='content')

# if content_div:
#     # Get the HTML content inside the div.
#     content_html = content_div.decode_contents()
#
#     # Directory and file name to save to.
#     save_dir = './output'  # change this directory if needed
#     os.makedirs(save_dir, exist_ok=True)  # create the directory if it does not exist
#     file_path = os.path.join(save_dir, 'index.md')
#
#     # Write the content to the target file.
#     with open(file_path, 'w', encoding='utf-8') as f:
#         f.write(content_html)
#     print(f"内容已保存到 {file_path}")
# else:
#     print("没有找到 class='content' 的 div 元素")
|
||||||
|
|
||||||
|
from urllib.parse import urljoin


def _safe_filename(text):
    """Return *text* with path separators, reserved characters and whitespace runs replaced by '-'."""
    return re.sub(r'[\\/:*?"<>|\s]+', '-', text).strip('-')


# Directory the pages are written to; change it here if needed.
save_dir = './output'

# Download every page linked from the catalog and store the inner HTML of
# its <div class="content"> as one file per catalog entry.
for tab in li_list:
    link = tab.find('a')
    # Skip <li> items that have no <a> tag or no href; there is nothing to
    # download for them.
    if link is None or not link.get('href'):
        continue

    short_href = link['href']
    name = link.get_text(strip=True)
    print(name)

    # urljoin gives the same result as plain concatenation for a relative
    # href and also copes with absolute or root-relative hrefs.
    page_url = urljoin(nextUrl, short_href)
    try:
        page_response = requests.get(page_url, headers=headers, timeout=30)
        page_response.raise_for_status()
    except requests.RequestException as exc:
        # One broken page should not abort the remaining downloads.
        print(f"Failed to download {page_url}: {exc}")
        continue

    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    # Find the div element whose class is 'content'.
    content_div = page_soup.find('div', class_='content')
    if content_div is None:
        print("没有找到 class='content' 的 div 元素")
        continue

    # Inner HTML of the div (the file gets an .md suffix but holds HTML).
    content_html = content_div.decode_contents()

    # Create the output directory on demand.
    os.makedirs(save_dir, exist_ok=True)
    # Link text and href are sanitised so a '/' or other reserved character
    # cannot break the path or escape save_dir.
    file_name = f"{_safe_filename(name)}_{_safe_filename(short_href)}.md"
    file_path = os.path.join(save_dir, file_name)

    # Write the content to the target file.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content_html)
    print(f"内容已保存到 {file_path}")
|
||||||
Reference in New Issue
Block a user