您的当前位置:首页>全部文章>文章详情

python之js逆向爬虫实战

发表于:2024-07-17 21:51:51浏览:251次TAG: #python #js #爬虫 #实战 #js拦截

引言

这是一个通过JS拦截来实现爬虫的实战

环境

技术点:js拦截
模块:playwright

# Install the playwright module (a proxy/VPN may be needed)
pip install playwright

# Run this after the installation has finished
playwright install

图片alt

图片alt

思路

  • 找出调用接口的地方 > 找到加密的参数 > 找到生成加密参数的JS文件,假设为demo.js
  • 把demo.js复制到本地,修改本地JS代码,把生成加密参数的JS方法挂到全局对象window上(例如window.encrypt)
  • 在python代码中通过route对该JS的请求进行拦截,把原本要加载的远程JS替换成本地JS
  • 这时候就能通过调用窗口对象window上的该方法来得到加密参数的值

代码

  • demo.py
from playwright.sync_api import sync_playwright
import requests

class Spider:
    """
    Scrape the movie list API of https://spa2.scrape.center/.

    The API request carries a ``token`` query parameter that is produced by
    the site's own JavaScript.  Instead of re-implementing that function, a
    Chromium page is opened with Playwright, the request for the site's
    script chunk is answered with a local copy (``./demo.js``) that assigns
    the token function to ``window.encrypt``, and that function is then
    called inside the page.

    The instance owns a Playwright driver and a browser; call :meth:`close`
    (or use the instance as a context manager) when finished with it.
    """

    # Seconds to wait for each API response before giving up.
    request_timeout = 10

    def __init__(self) -> None:
        self.max_page = 1
        self.page_url = "https://spa2.scrape.center/"
        self.api_url = "https://spa2.scrape.center/"
        self.limit = 10
        # Handles released by close(); set before __new_page() so that
        # close() is safe even if start-up fails half-way.
        self.page = None
        self._browser = None
        self._playwright = None
        try:
            self.__new_page()
        except BaseException:
            # Do not leave a browser/driver running when start-up fails.
            self.close()
            raise

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Release the browser resources when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """
        Close the browser and stop the Playwright driver.

        Safe to call more than once and on a partially initialised instance.
        """
        browser = getattr(self, "_browser", None)
        playwright = getattr(self, "_playwright", None)
        self.page = None
        self._browser = None
        self._playwright = None
        try:
            if browser is not None:
                browser.close()
        finally:
            # Stop the driver even if closing the browser raised.
            if playwright is not None:
                playwright.stop()

    def start(self):
        """
        Entry point: request ``max_page`` pages of the movie list.

        For each page the offset is ``page_index * limit``; a token for that
        offset is obtained from the browser page and printed, then the API is
        queried with ``requests``.  The response body is printed on HTTP 200,
        otherwise a failure message is printed.

        Raises:
            requests.RequestException: on a network error or when no response
                arrives within ``request_timeout`` seconds.
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
        }
        url = f"{self.api_url}api/movie/"
        for page_index in range(self.max_page):
            offset = page_index * self.limit
            token = self.__get_token(offset)
            print(token)
            # Let requests build and escape the query string.
            response = requests.get(
                url,
                params={"limit": self.limit, "offset": offset, "token": token},
                headers=headers,
                timeout=self.request_timeout,
            )
            if response.status_code == 200:
                print(response.text)
            else:
                print("请求失败")

    def __new_page(self):
        """
        Start Playwright, open the site and store the page on ``self.page``.

        The request for the site's script chunk is intercepted and fulfilled
        from the local file ``./demo.js`` (resolved against the current
        working directory).
        """
        # Start the Playwright driver.
        self._playwright = sync_playwright().start()
        # Launch a browser through the driver.
        self._browser = self._playwright.chromium.launch()
        # Open a page in that browser.
        page = self._browser.new_page()
        # Serve the local, modified copy instead of the remote script.
        page.route("https://spa2.scrape.center/js/chunk-10192a00.243cb8b7.js",lambda route:route.fulfill(path="./demo.js"))
        # Load the site so the intercepted script runs.
        page.goto(self.page_url)
        # demo.js assigns window.encrypt only when the component fetches its
        # data, so wait for the hook instead of assuming it already exists.
        page.wait_for_function("() => typeof window.encrypt === 'function'")
        self.page = page

    def __get_token(self,offset):
        """
        Return the token for ``offset`` by calling ``window.encrypt`` in the page.

        The offset is passed to the page as a string, together with the API
        path ``/api/movie``.
        """
        # Pass the values as data rather than splicing them into JS source.
        return self.page.evaluate(
            "([path, offset]) => window.encrypt(path, offset)",
            ["/api/movie", str(offset)],
        )

if __name__ == "__main__":
    # The context manager guarantees the browser is closed on exit.
    with Spider() as spider:
        spider.start()
  • demo.js
// Copy the content of this JS file from https://spa2.scrape.center/js/chunk-10192a00.243cb8b7.js

// Webpack chunk "chunk-10192a00": pushes [chunk ids, {module id: module function}]
// onto the global window["webpackJsonp"] array (created here if it does not exist).
// The only change relative to the remote file is the window.encrypt hook inside
// onFetchData (module "d504").
(window["webpackJsonp"] = window["webpackJsonp"] || []).push([["chunk-10192a00"], {
    // Modules with empty bodies.
    "5a19": function(t, a, e) {},
    "79ee": function(t, a, e) {},
    // Requires module "5a19" and reads e.n(s).a; the value is not used.
    ca9c: function(t, a, e) {
        "use strict";
        var s = e("5a19")
          , n = e.n(s);
        n.a
    },
    // Module "d504": the "Index" component (movie list page). Its default
    // export is u.exports, built from the options object `o` and the render
    // function `s` below.
    d504: function(t, a, e) {
        "use strict";
        e.r(a);
        // Render function: a loading-aware row of movie cards (one per entry of
        // t.movies) followed by a pagination row.
        var s = function() {
            var t = this
              , a = t.$createElement
              , e = t._self._c || a;
            return e("div", {
                attrs: {
                    id: "index"
                }
            }, [e("el-row", {
                directives: [{
                    name: "loading",
                    rawName: "v-loading",
                    value: t.loading,
                    expression: "loading"
                }]
            }, [e("el-col", {
                attrs: {
                    span: 18,
                    offset: 3
                }
            }, t._l(t.movies, (function(a) {
                return e("el-card", {
                    key: a.name,
                    staticClass: "item m-t",
                    attrs: {
                        shadow: "hover"
                    }
                }, [e("el-row", [e("el-col", {
                    attrs: {
                        xs: 8,
                        sm: 6,
                        md: 4
                    }
                }, [e("router-link", {
                    attrs: {
                        to: {
                            name: "detail",
                            params: {
                                key: t.transfer(a.id)
                            }
                        }
                    }
                }, [e("img", {
                    staticClass: "cover",
                    attrs: {
                        src: a.cover
                    }
                })])], 1), e("el-col", {
                    staticClass: "p-h",
                    attrs: {
                        xs: 9,
                        sm: 13,
                        md: 16
                    }
                }, [e("router-link", {
                    staticClass: "name",
                    attrs: {
                        to: {
                            name: "detail",
                            params: {
                                key: t.transfer(a.id)
                            }
                        }
                    }
                }, [e("h2", {
                    staticClass: "m-b-sm"
                }, [t._v(t._s(a.name) + " - " + t._s(a.alias))])]), e("div", {
                    staticClass: "categories"
                }, t._l(a.categories, (function(a) {
                    return e("el-button", {
                        key: a,
                        staticClass: "category",
                        attrs: {
                            size: "mini",
                            type: "primary"
                        }
                    }, [t._v(t._s(a) + "\n              ")])
                }
                )), 1), e("div", {
                    staticClass: "m-v-sm info"
                }, [e("span", [t._v(t._s(a.regions.join("、")))]), e("span", [t._v(" / ")]), e("span", [t._v(t._s(a.minute) + " 分钟")])]), e("div", {
                    staticClass: "m-v-sm info"
                }, [e("span", [t._v(t._s(a.published_at) + " 上映")])])], 1), e("el-col", {
                    attrs: {
                        xs: 5,
                        sm: 5,
                        md: 4
                    }
                }, [e("p", {
                    staticClass: "score m-t-md m-b-n-sm"
                }, [t._v(t._s(a.score.toFixed(1)))]), e("p", [e("el-rate", {
                    attrs: {
                        value: a.score / 2,
                        disabled: "",
                        max: 5,
                        "text-color": "#ff9900"
                    }
                })], 1)])], 1)], 1)
            }
            )), 1)], 1), e("el-row", [e("el-col", {
                attrs: {
                    span: 10,
                    offset: 11
                }
            }, [e("div", {
                staticClass: "pagination m-v-lg"
            }, [e("el-pagination", {
                attrs: {
                    background: "",
                    "current-page": t.page,
                    "page-size": t.limit,
                    layout: "total, prev, pager, next",
                    total: t.total
                },
                on: {
                    "current-change": t.onPageChange,
                    "update:currentPage": function(a) {
                        t.page = a
                    },
                    "update:current-page": function(a) {
                        t.page = a
                    }
                }
            })], 1)])], 1)], 1)
        }
          , n = []
          // i["a"] (from module "7d92") is the function that produces the
          // `token` request parameter; r["a"] (from module "3e22") is used as
          // the component's `transfer` method.
          , i = e("7d92")
          , r = e("3e22")
          // Component options.
          , o = {
            name: "Index",
            components: {},
            data: function() {
                return {
                    loading: !1,
                    total: null,
                    page: parseInt(this.$route.params.page || 1),
                    limit: 10,
                    movies: null
                }
            },
            // Fetch the first page of data as soon as the component is mounted.
            mounted: function() {
                this.onFetchData()
            },
            methods: {
                transfer: r["a"],
                // Navigate to the selected page's route, then reload the data.
                onPageChange: function(t) {
                    this.$router.push({
                        name: "indexPage",
                        params: {
                            page: t
                        }
                    }),
                    this.onFetchData()
                },
                // Request one page of movies: `a` is the offset
                // ((page - 1) * limit) and `e` is the token computed from the
                // index URL and that offset.
                onFetchData: function() {
                    var t = this;
                    this.loading = !0;
                    var a = (this.page - 1) * this.limit
                      , e = Object(i["a"])(this.$store.state.url.index, a);
                      /**
                       * JS interception hook (the only local modification).
                       * Object(i["a"]) evaluates to the token function itself,
                       * because Object() returns a function argument unchanged.
                       * 1. Bind it to the global window.encrypt, so calling
                       *    window.encrypt(...) is the same as calling
                       *    Object(i["a"])(...).
                       * 2. The Python code can then call window.encrypt in the
                       *    page to obtain the token value.
                       * Note: the assignment happens only when onFetchData
                       * runs (first triggered from `mounted`).
                       */
                      window.encrypt =  Object(i["a"])
                    // GET the index URL with limit/offset/token; on success
                    // store the results and total count and clear the loading flag.
                    this.$axios.get(this.$store.state.url.index, {
                        params: {
                            limit: this.limit,
                            offset: a,
                            token: e
                        }
                    }).then((function(a) {
                        var e = a.data
                          , s = e.results
                          , n = e.count;
                        t.loading = !1,
                        t.movies = s,
                        t.total = n
                    }
                    ))
                }
            }
        }
          , l = o
          // Requires modules "ca9c" and "e93d" for their side effects, then
          // takes module "2877", whose export `a` combines the options with
          // the render function.
          , c = (e("ca9c"),
        e("e93d"),
        e("2877"))
          , u = Object(c["a"])(l, s, n, !1, null, "8a85e5c6", null);
        a["default"] = u.exports
    },
    // Requires module "79ee" and reads e.n(s).a; the value is not used.
    e93d: function(t, a, e) {
        "use strict";
        var s = e("79ee")
          , n = e.n(s);
        n.a
    }
}]);