CUMT教务系统模拟登录

List [CTL]

没爬过自己学校教务网站怎么能说自己会敲爬虫 : )

在此记录模拟登录cumt教务系统

Demo

正文

教务系统网址

和许多学校相同，都是正方教务系统（ummm正方和煎蛋难兄难弟）

查看源代码

可以看到由五个js进行登录加密，为RSA加密，不了解rsa的看这里：RSA加密

提交表单

post的数据包括csrf令牌、明文的yhm（即学号，我随便敲的），以及先经RSA加密、再进行base64编码的mm（即密码，表单中提交了两次）

csrftoken

用来防止跨站请求伪造

源代码中搜索，找到随机生成的token表单value

登录加密

查看login.js

找到获取RSA公钥的地址

cookies问题

使用requests库的requests.session()保持会话

登录逻辑：从登录页面获取csrftoken，带时间参数请求login_getPublicKey.html获取rsa公钥（modulus和exponent），对获取到的公钥进行base64解码，用公钥对登录密码进行rsa加密，对密文再进行base64编码，最后post

rsa加密是最麻烦的地方

看了教务系统的base64编码js，发现它编码的对象是hex串所表示的字节，而不是hex串的字符本身。如果直接把hex字符串交给python标准库的base64，每个字符都会被当作一个字节来编码，而这里的RSA密钥需要的是完整的hex字符串，例如标准库中a0 => YTA=，而我需要a0 => oA==，即将a0看作一个字节的hex值进行编码（用标准库的话，需要先用bytes.fromhex把hex串转成字节再编码，才能得到同样的结果）。

故写了个base64 => hex的算法，其实图方便可以直接把base64.js改写为python版，但我自己写的原因是：…那个算法我没看懂

class HB64(object):
    """Convert between hexadecimal strings and standard Base64 text.

    The hex string is interpreted as a sequence of bytes (two hex digits per
    byte), not as ASCII characters, so ``"a0"`` encodes to ``"oA=="``.  For
    well-formed input the results match
    ``base64.b64encode(bytes.fromhex(s)).decode()`` and
    ``base64.b64decode(s).hex()`` from the standard library.
    """

    # Standard Base64 alphabet, indexed by 6-bit value.
    b64byte = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    # Padding character.
    b64cpt = "="

    def hex2b64(self, string):
        """Encode a hex string as padded Base64.

        Args:
            string: Hex digits; the length must be even (whole bytes).

        Returns:
            The Base64 text, padded with ``=`` to a multiple of four
            characters.  An empty input yields an empty string.

        Raises:
            ValueError: If the length is odd or a group is not valid hex.
        """
        length = len(string)
        if length % 2:
            raise ValueError("hex string must contain an even number of digits")

        pieces = []
        for start in range(0, length, 6):
            group = string[start:start + 6]
            byte_count = len(group) // 2
            # Left-align a short final group inside the 24-bit window so the
            # same shifts work for 1, 2 and 3 byte groups.
            bits = int(group, 16) << (8 * (3 - byte_count))
            # n bytes produce n + 1 Base64 characters.
            for position in range(byte_count + 1):
                pieces.append(self.b64byte[(bits >> (18 - 6 * position)) & 0x3F])
            pieces.append(self.b64cpt * (3 - byte_count))
        return "".join(pieces)

    def b642hex(self, string):
        """Decode padded Base64 text into a lowercase hex string.

        Args:
            string: Base64 text whose length is a multiple of four, with at
                most two trailing ``=`` padding characters.

        Returns:
            Two lowercase hex digits per decoded byte.  An empty input yields
            an empty string.

        Raises:
            ValueError: If the length or padding is malformed, or a character
                is outside the Base64 alphabet.
        """
        payload = string.rstrip(self.b64cpt)
        padding = len(string) - len(payload)
        if len(string) % 4 or padding > 2:
            raise ValueError("malformed Base64 string")

        pieces = []
        for start in range(0, len(payload), 4):
            group = payload[start:start + 4]
            bits = 0
            for char in group:
                # str.index raises ValueError for characters outside the
                # alphabet, including a stray "=" in the middle of the text.
                bits = (bits << 6) | self.b64byte.index(char)
            # Left-align a short (padded) final group in the 24-bit window.
            bits <<= 6 * (4 - len(group))
            # n characters carry n - 1 whole bytes; the bits that remain are
            # padding and must not be emitted.
            byte_count = len(group) - 1
            pieces.append(("%06x" % bits)[:2 * byte_count])
        return "".join(pieces)

RSA加密用的是JS中基于jsbn大数运算的特定实现，通过setPublic方法设置公钥，python标准库中没有对应的实现，见此

参考stackoverflow文章戳我，用了github上别人写的JS原生RSA加密的python版程序

代码

class httpmthd():
    """Simulated-login client for the Zhengfang educational administration site.

    Each instance owns its own ``requests`` session (and therefore its own
    cookies) and its own timestamp, which is sent as the ``time`` / ``_t``
    query parameter.  ``get_public`` and ``get_csrftoken`` must both have been
    called before ``post_data``, because it reads ``self.pub`` and
    ``self.token``.
    """

    # Common prefix of every endpoint used below.
    base_url = 'http://202.119.206.62/jwglxt/xtgl/'

    def __init__(self, user, passwd, timeout=10):
        """Store the credentials and open a fresh session.

        Args:
            user: Account name (student number); converted with ``str``.
            passwd: Plain-text password; converted with ``str``.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.user = str(user)
        self.passwd = str(passwd)
        self.timeout = timeout
        # Per-instance state: a class-level session would share cookies
        # between different users, and a class-level timestamp would be
        # frozen at import time.
        self.sessions = requests.session()
        self.time = int(time.time())

    def get_public(self):
        """Fetch the RSA public key JSON and keep it in ``self.pub``."""
        url = self.base_url + 'login_getPublicKey.html?time=' + str(self.time)
        r = self.sessions.get(url, timeout=self.timeout)
        self.pub = r.json()

    def get_csrftoken(self):
        """Extract the CSRF token from the login page into ``self.token``."""
        url = self.base_url + 'login_slogin.html?language=zh_CN&_t=' + str(self.time)
        r = self.sessions.get(url, timeout=self.timeout)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        self.token = soup.find('input', attrs={'id': 'csrftoken'}).attrs['value']

    def process_public(self, str):
        """RSA-encrypt a string with the fetched public key.

        Args:
            str: Plain text to encrypt (the login password).  The parameter
                name shadows the builtin; it is kept for compatibility.

        Returns:
            The ciphertext converted from hex to Base64 by ``HB64.hex2b64``.
        """
        a = HB64()
        # The server sends modulus and exponent Base64-encoded; RSAJS is
        # given them as hex strings.
        self.exponent = a.b642hex(self.pub['exponent'])
        self.modulus = a.b642hex(self.pub['modulus'])
        rsa = RSAJS.RSAKey()
        rsa.setPublic(self.modulus, self.exponent)
        cry_data = rsa.encrypt(str)
        return a.hex2b64(cry_data)

    def post_data(self):
        """Post the login form and keep the response in ``self.req``.

        Prints a message and raises ``SystemExit`` when the request fails or
        when the response page reports a wrong user name or password.
        """
        url = self.base_url + 'login_slogin.html'
        # No Content-Length here: requests derives it from the encoded body.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': '202.119.206.62',
            'Referer': self.base_url + 'login_slogin.html?language=zh_CN&_t=' + str(self.time),
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        }
        self.header = header
        try:
            # The form carries the "mm" field twice; encrypt once and reuse
            # the ciphertext for both occurrences.
            encrypted = self.process_public(self.passwd)
            data = [
                ('csrftoken', self.token),
                ('mm', encrypted),
                ('mm', encrypted),
                ('yhm', self.user),
            ]
            self.req = self.sessions.post(
                url, headers=header, data=data, timeout=self.timeout)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that SystemExit
            # and KeyboardInterrupt are never swallowed here.
            print('登录失败,请检查网络配置或检查账号密码...')
            time.sleep(1)
            raise SystemExit

        # Checked outside the ``try`` so a wrong password prints only its own
        # message instead of also falling into the failure handler above.
        if '用户名或密码不正确' in self.req.text:
            print('用户名或密码错误,请查验..')
            time.sleep(2)
            raise SystemExit