2020-02-29 19:08
这是一个测试评论
坐在图书馆摸鱼学习的时候,想把老师的PPT全部下载下来,不过一个个点击好麻烦啊...
话不多说直接上代码
import getpass
import json
import os
import subprocess
import sys
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup
# NOTE: this script targets Python 3.
# Fetch the captcha image and show it to the user.
print("正在获取验证码......")
session = requests.Session()
# Visit the login page first so the session holds whatever cookies the site
# sets before the captcha is requested.
session.get("http://jwc.swjtu.edu.cn/service/login.html")
# The current Unix timestamp is appended as the `test` query parameter.
r = session.get("http://jwc.swjtu.edu.cn/vatuu/GetRandomNumberToJPEG?test="+str(int(time.time())))
with open("code.jpg", "wb") as f:
    f.write(r.content)
# os.startfile only exists on Windows; fall back to the usual desktop openers
# elsewhere so the script does not die with AttributeError.
if hasattr(os, "startfile"):
    os.startfile("code.jpg")
else:
    opener = "open" if sys.platform == "darwin" else "xdg-open"
    try:
        subprocess.run([opener, "code.jpg"], check=False)
    except OSError:
        # No opener available: the image is still on disk, let the user open it.
        print("无法自动打开验证码图片,请手动打开 code.jpg")
# Collect the captcha text and the account credentials from the user.
yzm = input("请输入验证码:")
username = input("请输入用户名:")
# Read the password without echoing it to the terminal.
password = getpass.getpass("请输入密码:")
# Simulate the browser login.
# Step 1: POST the credentials and the captcha text to the login action.
sendmsg = {
    'username': username,
    'password': password,
    'url': 'http://jwc.swjtu.edu.cn/vatuu/UserExitAction&returnUrl',
    'area': '',
    'ranstring': yzm,
}
login_header = {
    'Referer': 'http://jwc.swjtu.edu.cn/service/login.html',
    'Origin': 'http://jwc.swjtu.edu.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'DNT': '1',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
}
r = session.post("http://jwc.swjtu.edu.cn/vatuu/UserLoginAction", data=sendmsg, headers=login_header)
# The reply is parsed as JSON. It is stored under its own name so the imported
# `json` module is not overwritten by the parsed dict.
login_result = json.loads(r.text)
print(login_result['loginMsg'])
# Step 2: confirm the login by posting the returned loginMsg to the loading action.
sendmsg = {
    'url': 'http://jwc.swjtu.edu.cn/vatuu/UserExitAction&returnUrl',
    'returnUrl': '',
    'loginMsg': login_result['loginMsg']
}
login_header = {
    'Referer': 'http://jwc.swjtu.edu.cn/vatuu/StudentScoreInfoAction?setAction=studentMarkUseProgram',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Encoding': 'deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
r = session.post("http://jwc.swjtu.edu.cn/vatuu/UserLoadingAction", data=sendmsg, headers=login_header)
# The script treats this marker text in the response as the sign of a successful login.
if '系统正在准备中,请稍候' in r.text:
    print('成功登陆!')
# Logged in: walk every listed course and download each of its resource files
# into a folder named after the course.
_BASE_URL = 'http://jwc.swjtu.edu.cn'


def _absolute_url(href):
    """Turn a '../...' href from the site into an absolute URL.

    Only the first '..' is substituted, so a '..' appearing later in the
    href (for example inside a query value) is left untouched.
    """
    return str(href).replace('..', _BASE_URL, 1)


def _safe_component(text):
    """Return *text* with path separators and '..' replaced by '-'.

    The text comes from the server, so it must not be able to point
    outside the course folder when used as a file name.
    """
    text = str(text)
    for bad in ('\\', '/', '..'):
        text = text.replace(bad, '-')
    return text


r = session.get('http://jwc.swjtu.edu.cn/vatuu/StudentTeachResourceAction?setAction=teachCourse')
soup = BeautifulSoup(r.text, "lxml")
for course_link in soup.find_all('a', "btn btn-mini btn-blue"):
    # URL of the course's resource list.
    url = _absolute_url(course_link.get('href'))
    # The course name is carried in the URL's query string.
    query_dict = parse.parse_qs(parse.urlparse(url).query)
    courseName = query_dict['courseName'][0]
    # Create the course folder; an existing folder means the course is skipped.
    try:
        os.makedirs(courseName)
    except FileExistsError:
        print('%s 课程文件夹已存在,跳过该课程...'%courseName)
        continue
    except OSError as err:
        # Any other failure (e.g. a name the file system rejects) also skips
        # the course, but is reported as what it is.
        print('%s 课程文件夹创建失败(%s),跳过该课程...' % (courseName, err))
        continue
    r = session.get(url)
    # Line breaks are stripped before parsing; the sibling navigation below
    # is applied to the tree parsed from this stripped text.
    text = r.text.replace('\n', '').replace('\r', '')
    tmp = BeautifulSoup(text, "lxml")
    for view_link in tmp.find_all('a', string="查看"):
        # Walk back from the link's parent: two siblings back is read as the
        # time, four siblings back as the teacher name.
        cell = view_link.parent
        c_time = cell.previous_sibling.previous_sibling.string
        teacher = cell.previous_sibling.previous_sibling.previous_sibling.previous_sibling.string
        # Open the download page of this resource.
        r = session.get(_absolute_url(view_link.get('href')))
        temp = BeautifulSoup(r.text, "lxml")
        tag = temp.find('a', string="点击下载")
        name_tag = temp.find('td', style="color: #0EA33F")
        if tag is None or name_tag is None:
            # Page layout not as expected: skip this entry instead of crashing.
            print('未找到下载链接,跳过:%s-%s' % (courseName, teacher))
            continue
        url = _absolute_url(tag.get('href'))
        name = name_tag.string
        # Download the file.
        print('开始下载:%s-%s-%s'%(courseName, teacher, name))
        wj = session.get(url)
        name = _safe_component(name)
        file_name = _safe_component("%s-%s-%s" % (teacher, c_time, name))
        # os.path.join keeps the path valid on every platform, not only Windows.
        with open(os.path.join(courseName, file_name), "wb") as f:
            f.write(wj.content)
        # Report completion only after the file has been written and closed.
        print('下载完成:%s-%s-%s' % (courseName, teacher, name))
代码写得很简单,但用起来没什么问题
单线程爬取和下载,有蛋疼热心的同学可改成异步或多线程的
顺便吐槽一下
所有科目合起来1.9G
马原+毛概+形政=1.6G
本文作者:卖女孩的小火柴 - 搬砖中
本文链接:https://www.shinenet.cn/archives/79.html
最后修改时间:2019-12-23 19:32:05
本站未注明转载的文章均为原创,并采用 CC BY-NC-SA 4.0 授权协议,转载请注明来源,谢谢!