写一个伪登录学校官网的爬虫

工具

Kali Linux（无需配环境，并且KDE桌面很好看）

Pycharm（开发工具）

本期博客仅做学习使用！出现任何问题与本人均无任何关系

测试阶段

打开burpsuite，开启抓包模式，通过chrome内核浏览器访问学校官网，利用burpsuite抓包测试发现，登录过程主要有以下流程：

登录校验cookie(如果有ST通行证则直接跳转网页)
如果没有cookie可以通过登录获取ST-通行证
跳转网页

查看网页cookie可以发现ST通行证保存在cookie里，初步猜测可能是渗透的弱点。

爬取测试

打开网页和burpsuite发现自动弹出一个包。

查阅源代码可以发现数据内容是时间戳。

点击输入框校验是否填写完整，可以不用管。随便输入什么然后改包试试。

观察 POST 提交的数据可以发现，主要是 username 和 password 的值，其中的 lt 为按钮的值；因为用的是 POST 请求，初步猜测这些值都来自同一个 form 表单：

打开开发者模式就可以发现有这几个在表单内的隐藏属性。

如果按顺序组装参数，利用post方式发送请求到指定网页的话：

成功登录会获取如下的值。保存下来

# 第一版本代码
import requests as rq
from bs4 import BeautifulSoup as bs

# Placeholder credentials; fill in a real account before running.
usernames = "用户名"
passwords = "密码"

# User-Agent string imitating Chrome on an Android device.
user_agent = "Mozilla/5.0 (Linux; Android 10; The Fucker) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36"


def buildPayLoad(username, password):
    """Download the login page and assemble the data for the login POST.

    Returns a ``(post_url, form_data)`` tuple: ``post_url`` is the host plus
    the ``action`` attribute of the ``casLoginForm`` form, and ``form_data``
    holds the credentials followed by the ``lt``, ``execution``, ``_eventId``
    and ``rmShown`` input values scraped from the page.
    """
    response = rq.Session().request(
        "GET", "http://authserver.hlju.edu.cn/authserver/login", allow_redirects=True
    )
    page = bs(response.text, "html.parser")

    form_action = page.find("form", {"id": "casLoginForm"})["action"]
    post_url = "http://authserver.hlju.edu.cn" + form_action

    form_data = {"username": username, "password": password}
    # Copy the current value of each form input the login request sends back.
    for field in ("lt", "execution", "_eventId", "rmShown"):
        form_data[field] = page.find("input", {"name": field})["value"]

    return post_url, form_data

def login(urls, payload):
    """POST ``payload`` to ``urls`` and return the Cookie header that was sent.

    The header is read from the request attached to the final response, so
    the result is ``None`` when that request carried no cookies.
    """
    response = rq.Session().post(urls, data=payload, allow_redirects=True)
    sent_headers = response.request.headers
    return sent_headers.get("Cookie")


def logout(hlju_cookie):
    """Call the logout endpoint with ``hlju_cookie`` and return the HTTP status code."""
    logout_url = "http://authserver.hlju.edu.cn/authserver/logout?service=http://authserver.hlju.edu.cn/authserver/login"
    response = rq.Session().get(
        logout_url,
        headers={"Cookie": hlju_cookie},
        allow_redirects=True,
    )
    return response.status_code

if __name__ == "__main__":
    # Scrape the login form, submit the credentials, then show the Cookie header.
    login_url, form_data = buildPayLoad(usernames, passwords)
    print(login(login_url, form_data))

测试发现：

代码可以成功获取到CASTGC和iPlanetDirectoryPro，一旦账号设置错误，将不会有这两个值。但是多次测试就会发现有的时候会获取不到这两个值，原因可能是没有注销或者网络延迟，所以这里可以加上循环进行获取。

def login(urls, payload):
    """Submit the login form until the sent cookies contain ``CASTGC``.

    Args:
        urls: URL the login form is posted to.
        payload: Form fields to send in the POST body.

    Returns:
        The Cookie header string of the final request once it contains
        ``CASTGC``.

    Note:
        The loop has no upper bound; it only ends when ``CASTGC`` appears.
    """
    session = rq.Session()
    # The user agent belongs in the request headers; passing it as ``params``
    # would append the string to the URL as a query string instead.
    headers = {"User-Agent": user_agent}
    while True:
        res = session.post(urls, data=payload, allow_redirects=True, headers=headers)
        # The header is absent when no cookies were sent; treat that as empty
        # rather than calling .find() on None.
        cookie = res.request.headers.get("Cookie") or ""
        if "CASTGC" in cookie:
            return cookie

但即便如此，也不能保证每次都登录成功。我忽然想到，可以先把 cookie 清除掉，再重新尝试登录：

def login(urls, payload):
    """Submit the login form, clearing cookies between tries, until ``CASTGC`` appears.

    Args:
        urls: URL the login form is posted to.
        payload: Form fields to send in the POST body.

    Returns:
        The Cookie header string of the final request once it contains
        ``CASTGC``.

    Note:
        The loop has no upper bound; it only ends when ``CASTGC`` appears.
    """
    session = rq.Session()
    while True:
        res = session.post(urls, data=payload, allow_redirects=True)
        # The header is absent when no cookies were sent; treat that as empty
        # rather than calling .find() on None.
        cookie = res.request.headers.get("Cookie") or ""
        if "CASTGC" in cookie:
            return cookie
        # Clear the session cookies so the next attempt starts clean.
        session.cookies.clear()

经过多次测试发现，这样就一定可以登录成功了！

继续抓包，发现登出是向authserver/logout?service=登录页进行发送请求，cookie为之前获取到的，所以我们就可以模拟登出。

def logout(cookie):
    """Call the logout endpoint with ``cookie``.

    Returns True when the request attached to the final response carried no
    Cookie header, False otherwise.
    """
    logout_url = "http://authserver.hlju.edu.cn/authserver/logout?service=http://authserver.hlju.edu.cn/authserver/login"
    response = rq.Session().get(
        logout_url,
        headers={"Cookie": cookie},
        allow_redirects=True,
    )
    remaining_cookie = response.request.headers.get("Cookie")
    return remaining_cookie is None

一旦登出，cookie就会被销毁，然后回到登录页面。这样其实这个网站就已经被拿下了。

我发现，一旦多次登录后就会出现验证码，并且是服务端进行发送的请求，观察验证码的图片，发现可以通过OCR识别进行校验，用一点计算机视觉就可以了！

验证码对应的参数是 captchaResponse。经过测试发现，请求验证码时会附带一个参数 ts，也就是微秒级的时间戳；同时发现，提交的验证码只要是最后一次访问 captcha.html 得到的值即可，所以就可以写出如下内容。

import cv2
import numpy as np
import pytesseract
from PIL import Image

def treat(img, threshold=127):
    """Pre-process a BGR image for OCR and return the cleaned single-channel image.

    Steps: grayscale conversion, fixed-threshold binarisation, erosion then
    dilation with a 2x2 kernel, and finally a median blur of aperture size 3.
    """
    structuring = np.ones((2, 2), np.uint8)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Pixels above ``threshold`` become 255, everything else 0.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    eroded = cv2.erode(binary, structuring, iterations=1)
    dilated = cv2.dilate(eroded, structuring, iterations=1)

    # Median blur (not a Gaussian blur).
    return cv2.medianBlur(dilated, 3)

def recognize_code(image):
    """OCR ``image`` with the English Tesseract model and return its letters and digits."""
    raw_text = pytesseract.image_to_string(image, lang="eng")
    # Drop every character that is not alphanumeric (whitespace included).
    return "".join(filter(str.isalnum, raw_text))

def needCaptcha():
    """Check whether the login needs a captcha and, if so, download and OCR it.

    Reads the module-level ``usernames`` and ``user_agent``.

    Returns:
        ``None`` when the needCaptcha endpoint answers "false"; otherwise the
        text recognised from the downloaded captcha image.

    Side effects:
        Writes the captcha to ``img.png`` in the working directory and then
        overwrites it with the pre-processed image.
    """
    session = rq.Session()
    # Let requests URL-encode the username instead of pasting it into the URL.
    resp = session.get(
        "https://authserver.hlju.edu.cn/authserver/needCaptcha.html",
        params={"username": usernames},
        allow_redirects=True,
        headers={"user-agent": user_agent},
    )
    # Tolerate surrounding whitespace or trailing content after "false".
    if resp.text.strip().startswith("false"):
        return None

    img = session.get("https://authserver.hlju.edu.cn/authserver/captcha.html").content
    with open("img.png", "wb") as f:
        f.write(img)

    # Clean the image in place so the OCR step reads the processed file.
    cv2.imwrite("img.png", treat(cv2.imread("img.png")))

    # Close the image file once OCR is done.
    with Image.open("img.png") as cleaned:
        return recognize_code(cleaned)

最后开源一下代码：

import requests as rq
from bs4 import BeautifulSoup as bs
import pytesseract
from PIL import Image
import cv2
import numpy as np

# Placeholder credentials (student ID and password); fill in before running.
usernames = "学号"
passwords = "密码"

def treat(img, threshold=127):
    """Return a binarised, de-speckled single-channel version of a BGR image.

    The image is converted to grayscale, thresholded at ``threshold``, eroded
    and dilated once with a 2x2 kernel, and smoothed with a median blur of
    aperture size 3.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, mask = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Erode then dilate with the same small kernel to remove isolated specks.
    small_kernel = np.ones((2, 2), np.uint8)
    mask = cv2.dilate(
        cv2.erode(mask, small_kernel, iterations=1),
        small_kernel,
        iterations=1,
    )

    return cv2.medianBlur(mask, 3)



def recognize_code(image):
    """Binarise a PIL image, OCR it, and return only its letters and digits."""
    cutoff = 150
    # Lookup table for point(): gray levels below the cutoff map to 0, the rest to 1.
    lookup = [0 if level < cutoff else 1 for level in range(256)]
    black_white = image.convert('L').point(lookup, '1')

    recognised = pytesseract.image_to_string(black_white, lang="eng")
    # Drop every character that is not alphanumeric (whitespace included).
    return ''.join(filter(str.isalnum, recognised))

def needCaptcha():
    """Check whether the login needs a captcha and, if so, download and OCR it.

    Reads the module-level ``usernames``.

    Returns:
        ``None`` when the needCaptcha response starts with "false"; otherwise
        the text recognised from the downloaded captcha image.

    Side effects:
        Writes the captcha to ``img.png`` in the working directory and then
        overwrites it with the pre-processed image.
    """
    session = rq.Session()
    # Let requests URL-encode the username instead of pasting it into the URL.
    resp = session.get(
        "https://authserver.hlju.edu.cn/authserver/needCaptcha.html",
        params={"username": usernames},
        allow_redirects=True,
    )
    if resp.text.startswith("false"):
        return None

    img = session.get("https://authserver.hlju.edu.cn/authserver/captcha.html").content
    with open("img.png", "wb") as f:
        f.write(img)

    # Clean the image in place so the OCR step reads the processed file.
    cv2.imwrite("img.png", treat(cv2.imread("img.png")))

    # Close the image file once OCR is done.
    with Image.open("img.png") as cleaned:
        return recognize_code(cleaned)


def buildPayLoad(username, password):
    """Assemble the login POST target and form data, adding a captcha answer if needed.

    Returns a ``(post_url, form_data)`` tuple. ``form_data`` holds the
    credentials, then ``captchaResponse`` when ``needCaptcha()`` returned a
    value, then the ``lt``, ``execution``, ``_eventId`` and ``rmShown`` input
    values scraped from the login page.
    """
    # Resolve the captcha before the login page is requested.
    captcha_text = needCaptcha()

    login_page = rq.Session().request(
        "get", "http://authserver.hlju.edu.cn/authserver/login", allow_redirects=True
    )
    soup = bs(login_page.text, "html.parser")

    form_action = soup.find("form", {"id": "casLoginForm"})["action"]
    post_url = "http://authserver.hlju.edu.cn" + form_action

    scraped = {
        field: soup.find("input", {"name": field})["value"]
        for field in ("lt", "execution", "_eventId", "rmShown")
    }

    form_data = {"username": username, "password": password}
    if captcha_text is not None:
        form_data["captchaResponse"] = captcha_text
    form_data.update(scraped)

    return post_url, form_data

def buildCookie(cookie):
    """Split a Cookie header string into a dict of the four session values.

    Values are taken by position: the first four ``;``-separated pairs are
    returned under the keys ``CASTGC``, ``route``, ``iPlanetDirectoryPro``
    and ``JSSESSIONID`` (key spelling kept as-is for existing callers).

    Args:
        cookie: Cookie header string such as ``"a=1; b=2; c=3; d=4"``.

    Returns:
        Dict mapping the four keys above to the corresponding values.

    Raises:
        IndexError: If there are fewer than four pairs or a pair has no ``=``.
    """
    keys = ("CASTGC", "route", "iPlanetDirectoryPro", "JSSESSIONID")
    # Parse the argument itself, not a module-level variable.
    pairs = cookie.split(";")
    # Split on the first "=" only, so "=" inside a value is not cut off.
    return {key: pairs[index].split("=", 1)[1] for index, key in enumerate(keys)}


def login(urls, payload, max_attempts=None):
    """Submit the login form, clearing cookies between tries, until ``CASTGC`` appears.

    Args:
        urls: URL the login form is posted to.
        payload: Form fields to send in the POST body.
        max_attempts: Optional cap on the number of POSTs. ``None`` (the
            default) retries without limit.

    Returns:
        The Cookie header string of the final request once it contains
        ``CASTGC``.

    Raises:
        RuntimeError: If ``max_attempts`` is set and that many attempts were
            made without ``CASTGC`` appearing.
    """
    session = rq.Session()
    attempts = 0
    while True:
        res = session.post(urls, data=payload, allow_redirects=True)
        attempts += 1
        # The header is absent when no cookies were sent; treat that as empty
        # rather than calling .find() on None.
        cookie = res.request.headers.get("Cookie") or ""
        if "CASTGC" in cookie:
            return cookie
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                "login failed: no CASTGC cookie after %d attempts" % attempts
            )
        # Clear the session cookies so the next attempt starts clean.
        session.cookies.clear()


def logout(cookie):
    """Call the logout endpoint with ``cookie``; True means the logout succeeded.

    Success is judged by the request attached to the final response carrying
    no Cookie header.
    """
    logout_url = "http://authserver.hlju.edu.cn/authserver/logout?service=http://authserver.hlju.edu.cn/authserver/login"
    response = rq.Session().get(
        logout_url,
        headers={"Cookie": cookie},
        allow_redirects=True,
    )
    remaining_cookie = response.request.headers.get("Cookie")
    return remaining_cookie is None


if __name__ == "__main__":
    # Build the form data (including a captcha answer when one is required).
    urls, payload = buildPayLoad(usernames, passwords)
    hlju_cookie = login(urls, payload)

    # bd_cookie is the final parsed result of the login.
    bd_cookie = buildCookie(hlju_cookie)

    logged_out = logout(hlju_cookie)
    if logged_out:
        print("登出成功")

总结

本次爬虫花了我5个小时，难度不大，但是很得劲，有种使用大脑的感觉，很爽的嘞~

技术

#python

写一个伪登录学校官网的爬虫

https://blog.minloha.cn/posts/152100121c742d2023122130.html

作者

Minloha

发布于

2023年12月21日

更新于

2023年12月21日

许可协议

[1]arm64架构学习上一篇

浅谈目标识别与运算加速下一篇