Scraping the Chaoxing (超星) Question Bank

My girlfriend at the time asked me to help with her Xuexitong (学习通) exams, so I put together this scraper for the back-end question bank. Xuexitong changes frequently, and different courses behave differently: some can be scraped and some can't, so you'll have to try it yourself. Before scraping you need the course ID, which generally appears in the following format:
(Screenshot: where the course ID appears.)
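For example (an illustration based on the urlInit template in the code below, not a guarantee that every course page looks like this): a course page URL such as https://mooc1.chaoxing.com/course/200837021.html contains the course ID as its numeric part, here 200837021.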

Then replace i in the code with your course ID.
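One note on dependencies: judging by the imports, the script needs requests, xlrd, xlwt and xlutils (its copy helper is what allows appending rows to an existing .xls file), so install those before running.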

import json

import requests
import xlrd
import xlwt
from xlutils.copy import copy
import re
from urllib import request
from urllib import error

import time

cou = '0'  # last courseId for which an output workbook has already been created

book_name_xls = '超星题库'  # output file name prefix ("Chaoxing question bank")

value_title = [["题号", "课程ID", "题型", "题目", "答案"], ]  # header row: question no., course ID, question type, question text, answer

urlInit = 'https://mooc1.chaoxing.com/course/{{courseId}}.html'
urlK = 'https://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail' \
       '?courseId={{courseId}}&knowledgeId={{knowledgeId}}'
workUrl = 'https://mooc1.chaoxing.com/api/selectWorkQuestion' \
          '?workId={{workId}}&ut=null&classId=0&courseId={{courseId}}&utenc=null'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0'
}
answerUrl = 'http://47.112.247.80/wkapi.php?q='
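# Note: answerUrl is a third-party answer-lookup service; getTextByCourseId below expects
# it to return JSON with an 'answer' field. Whether it is still online is anyone's guess;
# if it fails, the try/except in __main__ simply moves on to the next course ID.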


def __returnWorkUrl(courseId, workId):  # build the work-question API URL
    url = workUrl.replace('{{courseId}}', courseId).replace('{{workId}}', workId)  # fill in the {{...}} placeholders
    return url


def __getRequest(url):  # fetch a URL and return the decoded HTML (None on failure)
    req = request.Request(url, headers=headers)  # attach the request headers
    try:
        page = request.urlopen(req).read()  # send the request
        page = page.decode('utf-8')
        return page
    except error.URLError as e:
        print('courseId may not exist!', e.reason)


def __getFristData(courseId):  # get the first chapter's data id
    # Build the initial course URL and pull the first knowledgeId out of it
    url = urlInit.replace('{{courseId}}', courseId)  # fill in the {{...}} placeholder

    htmls = __getRequest(url)
    # print(htmls)
    re_rule = 'courseId=' + courseId + '&knowledgeId=(.*)">'
    url_frist = re.findall(re_rule, htmls)

    if len(url_frist) > 0:
        return url_frist[0]
    else:
        print(courseId, 'invalid courseId!')


def __returnTitle(courseId, knowledgeId):  # return the question titles and chapter data ids for one chapter
    url = urlK.replace('{{courseId}}', courseId).replace('{{knowledgeId}}', knowledgeId)  # fill in the {{...}} placeholders
    htmls = __getRequest(url)

    re_rule = '":"work-(.*?)"'
    wordId = re.findall(re_rule, htmls)  # collect every workId on the page

    wordId = list(set(wordId))  # round-trip through a set to drop duplicates
    print(wordId)
    title = []
    for x in wordId:
        wordUrl = __returnWorkUrl(courseId, x)
        html_work = __getRequest(wordUrl)
        title_rule = r'<div class="Zy_TItle clearfix">\s*<i class="fl">.*</i>\s*<div class=".*">(.*?)</div>'
        title = title + re.findall(title_rule, html_work)  # collect the question titles

    re_rule = r'<div id="c?o?u?r?s?e?C?h?a?p?t?e?r?S?e?l?e?c?t?e?d?" class="[\s\S]*?" data="(\d*)">?'
    datas = re.findall(re_rule, htmls)  # collect the data ids of all chapters

    return title, datas


def getTextByCourseId(courseId):
    global cou
    titles = []
    data_now = __getFristData(courseId)  # the first data id has to be fetched from a separate page
    j = 1
    while data_now:
        listR = __returnTitle(courseId, data_now)
        title = listR[0]
        data = listR[1]
        for i, x in enumerate(data):
            if data_now == x:
                if len(data) > (i + 1):
                    data_now = data[i + 1]
                else:
                    data_now = None
                    print('Finished collecting questions.')
                break

        # Print the questions. Strip the <p></p> and other HTML tags from the question text;
        # only some questions have them, probably added when Erya (尔雅) compiled the bank.
        p = re.compile(r'[【](.*?)[】]', re.S)  # pattern that extracts the 【question type】 tag
        for t in title:
            p_rule = '<.*?>'
            t = re.sub(p_rule, '', t)  # drop HTML tags
            p_rule = '&.*?;'
            t = re.sub(p_rule, '', t)  # drop HTML entities

            r = requests.post(answerUrl + t)  # query the answer-lookup API with the question text
            rd = json.loads(r.text)  # parse the JSON response into a Python object
            qtype = ','.join(re.findall(p, t))  # question type(s), joined so xlwt can write the cell
            k = [j, courseId, qtype, t, rd['answer']]
            titles.append(k)  # also keep the row in memory
            if cou != courseId:
                write_excel_xls(book_name_xls + courseId + '.xls', courseId, value_title)
                write_excel_xls_append(book_name_xls + courseId + '.xls', k)
                cou = courseId
            else:
                write_excel_xls_append(book_name_xls + courseId + '.xls', k)
            print(k)
            j += 1
    return titles


def write_excel_xls(path, sheet_name, value):  # create a new .xls file with the header row
    index = len(value)  # number of rows to write
    workbook = xlwt.Workbook()  # create a new workbook
    sheet = workbook.add_sheet(sheet_name)  # add a sheet to the workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])  # write each cell (row i, column j)
    workbook.save(path)  # save the workbook
    print("xls file created successfully!")


def write_excel_xls_append(path, value):  # append one row to an existing .xls file
    workbook = xlrd.open_workbook(path)  # open the workbook
    sheets = workbook.sheet_names()  # list the sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    rows_old = worksheet.nrows  # number of rows already in the sheet
    new_workbook = copy(workbook)  # convert the xlrd object into a writable xlwt object
    new_worksheet = new_workbook.get_sheet(0)  # first sheet of the converted workbook

    for j in range(0, len(value)):
        new_worksheet.write(rows_old, j, value[j])  # append the new row after the existing ones
    new_workbook.save(path)  # save the workbook
    print("Row appended to the xls file!")


def read_excel_xls(path):  # print the contents of an .xls file
    workbook = xlrd.open_workbook(path)  # open the workbook
    sheets = workbook.sheet_names()  # list the sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    for i in range(0, worksheet.nrows):
        for j in range(0, worksheet.ncols):
            print(worksheet.cell_value(i, j), "\t", end="")  # print the cells across the row
        print()  # newline after each row


# input('Enter the courseId: ')  # e.g. '200837021'; 200080607 yields 189 questions
if __name__ == '__main__':

    # Batch scraping
    # i is the course ID
    # Tip: some IDs do not exist, so the scrape will not necessarily run all the way
    #      through on its own; if it stops, restart it manually.
    # If you restart after some courses have already been scraped, change i,
    # otherwise they will be scraped all over again.
    # i = 208420011  # e.g. to scan from 208422277 up to 400000000
    i = 200837021
    while i:
        # write_excel_xls(book_name_xls, '1', value_title)

        try:
            courseId = str(i)
            getTextByCourseId(courseId)
        except Exception:
            print(courseId + ' data error')
            time.sleep(1)  # pause for 1 second
        i += 1
        if i > 400000000:
            exit()
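
As an aside, the read_excel_xls helper above is defined but never called. A minimal usage sketch, assuming a scrape of course 200837021 has already finished so the file 超星题库200837021.xls exists (the file name follows the book_name_xls + courseId + '.xls' pattern used in the code):

# Sketch only: dump an already-scraped question bank back to the console.
# Run inside (or at the bottom of) the script above, once the .xls file exists.
read_excel_xls(book_name_xls + '200837021' + '.xls')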

Whether it works comes down to luck. If it's just a low-stakes elective online course, though, you can also try a Tampermonkey (油猴) userscript in your browser.
PS: let me add one thing here. Don't assume that just because your elective's regular quizzes are all full marks and the videos are all watched, the final exam doesn't matter. During one of my finals the Tampermonkey script happened to be broken for that stretch, so I picked C for everything and submitted, figuring I surely couldn't fail, at worst I'd get a lower score. And then it, it, it, it... I failed.
