0x00 介绍

由于本人兴趣所致,一直对数据分析有极大的兴趣,所以也会进行相关方面的东西的一些尝试。

既然是数据分析第一步自然是抓取数据,既然做了自然是希望这个东西有一定的价值,所以选择了twitter和微博来练手。所以也就有了爬微博数据的需求。

后续也会将自己做的一些东西和一些思考按照进度放在博客上,做个记录。

0x01 问题

微博的站点一共有三个,分别是:

    1. weibo.cn
    1. m.weibo.cn
    1. weibo.com

页面的复杂度也是从低到高,所以为了让数据爬取更简单,肯定首选 weibo.cn。但是 weibo 有个很蛋疼的问题:抓取某些页面时需要登录,而登录时会出现一个滑动轨迹验证码(如图所示),本文要解决的就是这个验证码问题。

0x02 思路

    1. 该图形为四宫格,所以可能的组合只有 4×3×2×1 = 24 种。
    1. 可以找出这24种验证码的图片。
    1. 图片特别干净没有噪点。
    1. 将图片灰度化后与收集的24种不同的图片逐像素进行比对,即可知道滑动路线。
    1. pyppeteer 模拟鼠标滑动进行登陆。

0x03 代码

    1. 获取验证码图片
async def MainLogin():
    """Submit the Weibo login form and screenshot the pattern captcha.

    Launches Chrome through pyppeteer (visible window, local SOCKS5 proxy),
    types the placeholder credentials, clicks the login button, waits for the
    pattern-captcha holder to appear and saves a screenshot clipped to the
    captcha element as ``4.png`` in the current working directory.

    The browser is always closed before the coroutine returns, so calling
    this repeatedly to collect captcha samples does not pile up Chrome
    processes.
    """
    browser = await launch(executablePath=ChromePath, headless=False, args=['--proxy-server=socks5://127.0.0.1:1080'])
    try:
        page = await browser.newPage()
        await page.goto(WeiboLoginUri, {"waitUntil": "documentloaded"})

        # Idle for two seconds after the page opens. asyncio.sleep is awaited
        # instead of calling time.sleep so the event loop is not blocked.
        await asyncio.sleep(2)
        # Fill in the login form, typing with a small per-key delay.
        await page.type(selector="#loginName", text="yourLoginname", options={"delay": 50})
        await page.type(selector="input#loginPassword", text="yourPassword", options={"delay": 50})

        # Press the login button.
        Button = await page.querySelector("#loginAction")
        await Button.click()
        # Wait until the pattern captcha has been inserted into the page.
        await page.waitForSelector("#patternCaptchaHolder > div.patt-holder-body")
        # Give the captcha's intro animation time to finish.
        await asyncio.sleep(2.5)

        # Locate the captcha element and read its bounding box.
        CapturePic = await page.querySelector("#patternCaptchaHolder > div.patt-holder-body > div.patt-shadow")
        CapturePicPoint = await CapturePic.boundingBox()

        # Screenshot only the captcha area.
        await page.screenshot({"path": "4.png", "clip": CapturePicPoint})
    finally:
        await browser.close()
    1. 通过循环获取图片,然后提取出24种不同的图片

    1. 提取这24张图片每个像素点的灰度值。
def getIMSpy():
    """Dump the grayscale pixels of every template image into ``ims.py``.

    Every file in the ``CapturePicAll`` directory is opened, converted to
    8-bit grayscale and stored column by column, so that
    ``ims[key][x][y]`` is the gray value of pixel ``(x, y)``. The key is the
    file name up to its first ``.``.

    The result is written to ``ims.py`` in the current working directory as a
    single ``ims = {...}`` assignment. The file is overwritten on every run;
    appending would leave several assignments glued together on one line,
    which is not valid Python.
    """
    ims = {}
    Path = "CapturePicAll"

    for fingerPicFile in os.listdir(Path):
        keyName = fingerPicFile.split('.')[0]
        PicABSPath = Path + os.sep + fingerPicFile

        # Close the underlying file as soon as the grayscale copy exists.
        with Image.open(PicABSPath) as rawPic:
            grayPic = rawPic.convert('L')

        width, height = grayPic.size
        # Build the pixel accessor once per image rather than once per pixel.
        pixels = grayPic.load()
        ims[keyName] = [
            [pixels[i, j] for j in range(height)]
            for i in range(width)
        ]

    with open("ims.py", 'w', encoding="utf-8") as f:
        f.write("ims = ")
        f.write(str(ims))

这样就可以得到一个ims.py文件,里面包含每个路径图片的特征。

    1. 获取登陆验证码图片与ims.py中路径中的每个特征进行对比,可得到路径。
def AalysisPic(FilePath):
    """Return the key of the template in ``ims`` that matches a captcha image.

    The image at ``FilePath`` is converted to grayscale and compared pixel by
    pixel with each template in ``ims`` (indexed as ``ims[key][x][y]``).
    Templates are tried in the dict's iteration order and the first one
    without a mismatching pixel wins.

    Args:
        FilePath: path of the captcha screenshot.

    Returns:
        The key of the first matching template, or ``""`` when no template
        matches (or the image has no pixels).

    Raises:
        IndexError: if a template is smaller than the image.
    """
    SourcePic = Image.open(FilePath).convert('L')
    width, height = SourcePic.size
    if not width or not height:
        return ""

    # Build the pixel accessor once instead of on every pixel lookup.
    pixels = SourcePic.load()

    def matches(template):
        for i in range(width):
            column = template[i]
            for j in range(height):
                source = pixels[i, j]
                reference = column[j]
                # 245 is the threshold: values >= 245 are blank background,
                # values < 245 belong to the drawn line. A pixel only counts
                # as a mismatch when the two images fall on opposite sides of
                # the threshold AND differ by more than 10, which tolerates
                # noise right at the 245 boundary.
                if (source >= 245) != (reference >= 245) and abs(reference - source) > 10:
                    return False
        return True

    for MoveWay, template in ims.items():
        if matches(template):
            return MoveWay

    return ""


async def MainLogin():
    """Submit the Weibo login form, screenshot the captcha and print its path.

    Launches Chrome through pyppeteer (visible window, local SOCKS5 proxy),
    types the placeholder credentials, clicks the login button, waits for the
    pattern captcha and saves a screenshot clipped to the captcha element as
    ``4.png``. The screenshot is then passed to ``AalysisPic`` and the
    resulting path string is printed.

    The browser is always closed once the screenshot has been taken or an
    error occurred.
    """
    browser = await launch(executablePath=ChromePath, headless=False, args=['--proxy-server=socks5://127.0.0.1:1080'])
    try:
        page = await browser.newPage()
        await page.goto(WeiboLoginUri, {"waitUntil": "documentloaded"})

        # Idle for two seconds after the page opens. asyncio.sleep is awaited
        # instead of calling time.sleep so the event loop is not blocked.
        await asyncio.sleep(2)
        # Fill in the login form, typing with a small per-key delay.
        await page.type(selector="#loginName", text="yourLoginname", options={"delay": 50})
        await page.type(selector="input#loginPassword", text="yourPassword", options={"delay": 50})

        # Press the login button.
        Button = await page.querySelector("#loginAction")
        await Button.click()
        # Wait until the pattern captcha has been inserted into the page.
        await page.waitForSelector("#patternCaptchaHolder > div.patt-holder-body")
        # Give the captcha's intro animation time to finish.
        await asyncio.sleep(2.5)

        # Locate the captcha element and read its bounding box.
        CapturePic = await page.querySelector("#patternCaptchaHolder > div.patt-holder-body > div.patt-shadow")
        CapturePicPoint = await CapturePic.boundingBox()

        # Screenshot only the captcha area.
        await page.screenshot({"path": "4.png", "clip": CapturePicPoint})
    finally:
        await browser.close()

    # The screenshot is on disk, so the browser is no longer needed here.
    MoveWay = AalysisPic("4.png")
    print(MoveWay)
    1. 页面定位到验证码的位置,模拟滑动,定位后需要加个随机值,防止位置一直相同。需要将连线的step调高,看起来更像人手动滑动的。
import time
import random
import asyncio
from pyppeteer import launch

from PIL import Image

from ims import ims

# Login page of the mobile Weibo site; the `r` query parameter carries https://weibo.cn.
WeiboLoginUri = "https://passport.weibo.cn/signin/login?entry=mweibo&r=https://weibo.cn"
# Path of the Chrome executable handed to pyppeteer's launch(); adjust for the local machine.
ChromePath = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"


def AalysisPic(FilePath):
    """Return the key of the template in ``ims`` that matches a captcha image.

    The image at ``FilePath`` is converted to grayscale and compared pixel by
    pixel with each template in ``ims`` (indexed as ``ims[key][x][y]``).
    Templates are tried in the dict's iteration order and the first one
    without a mismatching pixel wins.

    Args:
        FilePath: path of the captcha screenshot.

    Returns:
        The key of the first matching template, or ``""`` when no template
        matches (or the image has no pixels).

    Raises:
        IndexError: if a template is smaller than the image.
    """
    SourcePic = Image.open(FilePath).convert('L')
    width, height = SourcePic.size
    if not width or not height:
        return ""

    # Build the pixel accessor once instead of on every pixel lookup.
    pixels = SourcePic.load()

    def matches(template):
        for i in range(width):
            column = template[i]
            for j in range(height):
                source = pixels[i, j]
                reference = column[j]
                # 245 is the threshold: values >= 245 are blank background,
                # values < 245 belong to the drawn line. A pixel only counts
                # as a mismatch when the two images fall on opposite sides of
                # the threshold AND differ by more than 10, which tolerates
                # noise right at the 245 boundary.
                if (source >= 245) != (reference >= 245) and abs(reference - source) > 10:
                    return False
        return True

    for MoveWay, template in ims.items():
        if matches(template):
            return MoveWay

    return ""


async def MainLogin():
    """Log in to Weibo and solve the pattern captcha by dragging the mouse.

    Launches Chrome through pyppeteer (visible window, local SOCKS5 proxy),
    types the placeholder credentials, clicks the login button and
    screenshots the pattern captcha to ``4.png``. ``AalysisPic`` turns the
    screenshot into a path string whose characters name the four grid points
    in visiting order ("1" top-left, "2" top-right, "3" bottom-left,
    "4" bottom-right). The mouse is moved to the first point, pressed,
    dragged through the remaining points and released on the last one.

    The browser is always closed before the coroutine returns.

    Raises:
        RuntimeError: if the captcha matched no template or the resulting
            path contains a character other than "1"-"4".
    """
    browser = await launch(executablePath=ChromePath, headless=False, args=['--proxy-server=socks5://127.0.0.1:1080'])
    try:
        page = await browser.newPage()
        await page.goto(WeiboLoginUri, {"waitUntil": "documentloaded"})

        # Idle for two seconds after the page opens. asyncio.sleep is awaited
        # instead of calling time.sleep so the event loop is not blocked.
        await asyncio.sleep(2)
        # Fill in the login form, typing with a small per-key delay.
        await page.type(selector="#loginName", text="yourLoginname", options={"delay": 50})
        await page.type(selector="input#loginPassword", text="yourPassword", options={"delay": 50})

        # Press the login button.
        Button = await page.querySelector("#loginAction")
        await Button.click()
        # Wait until the pattern captcha has been inserted into the page.
        await page.waitForSelector("#patternCaptchaHolder > div.patt-holder-body")
        # Give the captcha's intro animation time to finish.
        await asyncio.sleep(2.5)

        # Locate the captcha element and read its bounding box.
        CapturePic = await page.querySelector("#patternCaptchaHolder > div.patt-holder-body > div.patt-shadow")
        CapturePicPoint = await CapturePic.boundingBox()

        # Screenshot only the captcha area and recognise the path.
        await page.screenshot({"path": "4.png", "clip": CapturePicPoint})
        MoveWay = AalysisPic("4.png")

        # Grid points at offsets 32*1 / 32*4 from the captcha's top-left
        # corner, each coordinate shifted by a random -20..+20 px so the
        # cursor never lands on exactly the same spot twice.
        grid = (("1", 1, 1), ("2", 4, 1), ("3", 1, 4), ("4", 4, 4))
        points = {}
        for label, col, row in grid:
            points[label] = {
                "x": CapturePicPoint["x"] + 32 * col + random.randint(0, 40) - 20,
                "y": CapturePicPoint["y"] + 32 * row + random.randint(0, 40) - 20,
            }

        if not MoveWay or any(label not in points for label in MoveWay):
            raise RuntimeError("captcha in 4.png did not yield a usable path: %r" % (MoveWay,))

        # Number of intermediate mouse events per segment; high values make
        # the drag look like a hand movement. The values are carried over
        # unchanged from the earlier hand-written branches.
        first_steps = {"1": 70, "2": 70, "3": 60, "4": 70}
        last_steps = {"1": 70, "2": 80, "3": 70, "4": 70}
        middle_steps = 60

        await asyncio.sleep(1)
        last_index = len(MoveWay) - 1
        for index, label in enumerate(MoveWay):
            if index == 0:
                steps = first_steps[label]
            elif index == last_index:
                steps = last_steps[label]
            else:
                steps = middle_steps

            target = points[label]
            await page.mouse.move(target['x'], target['y'], {"steps": steps})

            # Press on the first point, release on the last one.
            if index == 0:
                await page.mouse.down()
            if index == last_index:
                await page.mouse.up()

        # Leave the window open for a while so the login can complete.
        await asyncio.sleep(20)
    finally:
        await browser.close()


if __name__ == "__main__":
    # Script entry point: drive the login coroutine to completion.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(MainLogin())

0x06

下一步准备做的是设计下weibo爬虫的分布式,想每天的数据量能有2千万。当然更想拿到twitter的数据。
可能还会学习下nlp相关的知识(虽然已经在看了),数据挖掘(有一定尝试),以支撑我想完成的东西。