超星题库爬取
当时对象的学习通考试要我帮忙,给她搞的一个后台题库的爬取。不过学习通经常变,不同课程也不一样,有的能爬到有的就爬不到,你可以自己试试。爬之前需要拿到课程ID,课程ID一般是一串纯数字(例如代码里默认的 200837021)
之后把代码里面的i换成你的课程ID
import json
import requests
import xlrd
import xlwt
from xlutils.copy import copy
import re
from urllib import request
from urllib import error
import time
# Last courseId for which a workbook (with header row) was created; '0' means
# none yet. Mutated by getTextByCourseId.
cou = '0'
# Prefix of the output workbook file name; courseId + '.xls' is appended.
book_name_xls = '超星题库'
# Spreadsheet header row: question number, course ID, question type, question, answer.
value_title = [["题号", "课程ID", "题型", "题目", "答案"], ]
# Course landing page; {{courseId}} is substituted before the request.
urlInit = 'https://mooc1.chaoxing.com/course/{{courseId}}.html'
# Chapter (knowledge point) detail page; both placeholders are substituted.
urlK = 'https://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail?courseId={{courseId}}&knowledgeId={{' \
       'knowledgeId}} '
# API returning the questions of one "work" (assignment); see __returnWorkUrl.
workUrl = 'https://mooc1.chaoxing.com/api/selectWorkQuestion?workId={{workId}}&ut=null&classId=0&courseId={{' \
          'courseId}}&utenc=null '
# Desktop-browser User-Agent so the site serves the normal HTML pages.
headers = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.字符甘特图'
}
# Third-party answer-lookup service; the question text is appended as query q.
answerUrl = 'http://47.112.247.80/wkapi.php?q='
def __returnWorkUrl(courseId, workId):
    """Fill the {{courseId}} and {{workId}} placeholders of workUrl and return the URL."""
    substitutions = {'{{workId}}': workId, '{{courseId}}': courseId}
    url = workUrl
    for placeholder, value in substitutions.items():
        url = url.replace(placeholder, value)
    return url
def __getRequest(url):
    """GET *url* with the module-level headers and return the body decoded as UTF-8.

    On a URLError the error is printed and None is returned; callers must be
    prepared for a None result.
    """
    req = request.Request(url, headers=headers)  # attach the browser headers
    try:
        raw = request.urlopen(req).read()
    except error.URLError as e:
        print('courseId可能不存在哦!', e.reason)
        return None
    return raw.decode('utf-8')
def __getFristData(courseId):
    """Return the first knowledgeId of the course, or None when it cannot be found.

    The first chapter id is only available on the course landing page, so it is
    scraped there with a regex. (The 'Frist' typo in the name is kept for
    caller compatibility.)
    """
    url = urlInit.replace('{{courseId}}', courseId)
    htmls = __getRequest(url)
    if htmls is None:
        # FIX: __getRequest returns None on network errors; the original then
        # crashed with a TypeError inside re.findall.
        print(courseId, 'courseId错误!')
        return None
    re_rule = 'courseId=' + courseId + '&knowledgeId=(.*)">'
    matches = re.findall(re_rule, htmls)
    if matches:
        return matches[0]
    print(courseId, 'courseId错误!')
    return None
def __returnTitle(courseId, knowledgeId):
    """Fetch one chapter page and return (question titles, chapter data ids).

    Titles are scraped from every work's selectWorkQuestion page; the data ids
    let the caller step to the next chapter.
    """
    url = urlK.replace('{{courseId}}', courseId).replace('{{knowledgeId}}', knowledgeId)
    htmls = __getRequest(url)
    # Work ids embedded in the page, deduplicated via a set.
    wordId = list(set(re.findall(r'":"work-(.*?)"', htmls)))
    print(wordId)
    title = []
    # FIX: raw strings — '\s' / '\d' / '\S' in plain string literals are
    # invalid escape sequences (SyntaxWarning in modern Python, future error).
    title_rule = r'<div class="Zy_TItle clearfix">\s*<i class="fl">.*</i>\s*<div class=".*">(.*?)</div>'
    for x in wordId:
        html_work = __getRequest(__returnWorkUrl(courseId, x))
        if html_work is None:
            continue  # FIX: one failed work download no longer aborts the chapter
        title = title + re.findall(title_rule, html_work)
    re_rule = r'<div id="c?o?u?r?s?e?C?h?a?p?t?e?r?S?e?l?e?c?t?e?d?" class="[\s\S]*?" data="(\d*)">?'
    datas = re.findall(re_rule, htmls)
    return title, datas
def getTextByCourseId(courseId):
    """Crawl every question of one course, look up answers, and save them to xls.

    courseId: course ID as a string.
    Returns the list of collected rows ([number, courseId, type-tags, question,
    answer]); empty when nothing was found.
    Side effects: writes/appends '超星题库<courseId>.xls' and updates the global
    `cou` marker so the header row is only written once per course.
    """
    global cou
    titles = []
    # The first chapter data id has to be scraped from the course landing page.
    data_now = __getFristData(courseId)
    j = 1
    # Extracts the question type, e.g. 【单选题】, from the cleaned title text.
    p = re.compile(r'[【](.*?)[】]', re.S)
    while data_now:
        title, data = __returnTitle(courseId, data_now)
        # Advance data_now to the chapter after the current one; None ends the loop.
        # BUG FIX: the original left data_now unchanged when it did not appear in
        # `data`, which re-fetched the same chapter forever.
        next_data = None
        for i, x in enumerate(data):
            if data_now == x:
                if len(data) > (i + 1):
                    next_data = data[i + 1]
                else:
                    print('获取题目结束.')
                break
        data_now = next_data
        for t in title:
            # Strip HTML tags and entities; some questions carry <p>…</p> wrappers.
            t = re.sub('<.*?>', '', t)
            t = re.sub('&.*?;', '', t)
            r = requests.post(answerUrl + t)  # query the answer service
            rd = json.loads(r.text)
            k = [j, courseId, re.findall(p, t), t, rd['answer']]
            titles.append(k)
            if cou != courseId:
                # First row for a new course: create the workbook with the header.
                write_excel_xls(book_name_xls + courseId + '.xls', courseId, value_title)
                write_excel_xls_append(book_name_xls + courseId + '.xls', k)
                cou = courseId
            else:
                write_excel_xls_append(book_name_xls + courseId + '.xls', k)
            print(k)
            j += 1
    return titles
def write_excel_xls(path, sheet_name, value):
    """Create a new .xls workbook at *path* with one sheet named *sheet_name*
    and write the 2-D list *value* into it, cell by cell."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)
    for row_idx, row in enumerate(value):
        for col_idx, cell in enumerate(row):
            sheet.write(row_idx, col_idx, cell)
    workbook.save(path)
    print("xls格式表格写入数据成功!")
def write_excel_xls_append(path, value):
    """Append *value* (one flat row of cells) after the last row of the first
    sheet in the existing .xls workbook at *path*.

    xls files cannot be appended in place: the workbook is reopened with xlrd,
    cloned to a writable xlwt workbook via xlutils.copy, and saved back.
    """
    # FIX: dropped the unused `index = len(value)` local (its comment also
    # wrongly described `value` as multiple rows — it is a single row).
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows  # next free row index
    new_workbook = copy(workbook)  # xlrd workbooks are read-only; clone to xlwt
    new_worksheet = new_workbook.get_sheet(0)
    for col, cell in enumerate(value):
        new_worksheet.write(rows_old, col, cell)
    new_workbook.save(path)
    print("xls格式表格【追加】写入数据成功!")
def read_excel_xls(path):
    """Print every cell of the first sheet of the .xls workbook at *path*,
    tab-separated, reading row by row."""
    workbook = xlrd.open_workbook(path)
    first_sheet = workbook.sheet_by_name(workbook.sheet_names()[0])
    for row in range(first_sheet.nrows):
        for col in range(first_sheet.ncols):
            print(first_sheet.cell_value(row, col), "\t", end="")
# input('请输入courseId:') # '200837021' 200080607 = 189题
if __name__ == '__main__':
    # Batch crawl: i is the course ID, incremented once per second.
    # Some IDs do not exist, so the run may stop with errors and need a manual
    # restart; change the starting i after a restart to avoid re-crawling
    # courses that were already saved.
    i = 200837021
    while i:
        courseId = str(i)  # bound before the try so the handler can't see an unbound name
        try:
            getTextByCourseId(courseId)
        except Exception:
            # FIX: narrowed from a bare `except:` — that also swallowed
            # KeyboardInterrupt/SystemExit, making Ctrl-C unable to stop the run.
            print(courseId + "数据异常")
        time.sleep(1)  # pause 1 second between courses
        i += 1
        if i > 400000000:
            break  # upper bound of the ID scan
成不成功看你运气喽,不过要是不重要的选修网课的话可以试试浏览器油猴子插件
PS:这里插一嘴,别以为选修平常考试都满分,课也都刷完了,期末考试就不重要了。我之前期末考试的时候碰上油猴子插件那段时间正好不好使,我就全选的C提交的,寻思应该不能挂,大不了分低一点,没想到他,他,他,他挂了