python之js逆向爬虫实战
发表于:2024-07-17 21:51:51浏览:251次
引言
这是一个通过JS拦截来实现爬虫的实战
环境
技术点:js拦截
模块:playwright
# 安装playwright模块(需梯子)
pip install playwright
# 安装完成后执行
playwright install
思路
- 找出调用接口的地方 > 找到接口中被加密的参数 > 找到生成该加密参数的JS文件,假设为demo.js
- 把demo.js复制到本地,修改本地JS代码,把生成加密参数的JS方法挂载到全局对象window上(例如 window.encrypt)
- 在python代码中通过route对该JS的请求进行拦截:原本应加载远程JS,拦截后改为返回本地修改过的JS
- 这时候就能通过窗口对象window调用该方法(window.encrypt),获取加密后的参数值
代码
- demo.py
from playwright.sync_api import sync_playwright
import requests
class Spider:
    """Scrape the movie list API of spa2.scrape.center.

    A headless Chromium page is opened on the site while the request for the
    JS chunk that computes the ``token`` query parameter is answered with a
    local, patched copy (``./demo.js``) that exposes the token routine as
    ``window.encrypt``. Tokens are then obtained by calling that function
    inside the page, and the API itself is queried with ``requests``.

    The instance owns a browser process; call :meth:`close` (or use the
    instance as a context manager) when done.
    """

    # Remote chunk whose request is intercepted and served from ./demo.js.
    _CHUNK_URL = "https://spa2.scrape.center/js/chunk-10192a00.243cb8b7.js"
    # Seconds to wait for the HTTP API before giving up instead of hanging.
    _REQUEST_TIMEOUT = 10

    # Class-level defaults so close() is safe even if start-up never ran.
    _playwright = None
    _browser = None

    def __init__(self) -> None:
        self.max_page = 1
        self.page_url = "https://spa2.scrape.center/"
        self.api_url = "https://spa2.scrape.center/"
        self.limit = 10
        self.__new_page()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def start(self):
        """Entry point: fetch ``max_page`` pages and print each response.

        For every page a fresh token is computed in the browser, printed,
        and sent along with ``limit``/``offset`` to the movie API.

        Raises:
            requests.RequestException: on network errors or timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
        }
        for x in range(self.max_page):
            offset = x * self.limit
            token = self.__get_token(offset)
            print(token)
            # Let requests build the query string so that characters such as
            # '+', '/' or '=' in the token are percent-encoded correctly.
            response = requests.get(
                f"{self.api_url}api/movie/",
                params={"limit": self.limit, "offset": offset, "token": token},
                headers=headers,
                timeout=self._REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                print(response.text)
            else:
                print("请求失败")

    def close(self):
        """Close the browser and stop the Playwright driver (idempotent)."""
        browser, self._browser = self._browser, None
        playwright, self._playwright = self._playwright, None
        try:
            if browser is not None:
                browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            if playwright is not None:
                playwright.stop()

    def __new_page(self):
        """Launch Chromium, install the JS interception and open the site.

        On success the ready page is stored in ``self.page``. If any step
        fails, the already started browser/driver are shut down again
        before the exception is re-raised.
        """
        # Start the Playwright driver.
        self._playwright = sync_playwright().start()
        try:
            # Launch a browser and open a page in it.
            self._browser = self._playwright.chromium.launch()
            page = self._browser.new_page()
            # Answer the request for the remote chunk with the local copy.
            # The path is relative to the current working directory.
            page.route(
                self._CHUNK_URL,
                lambda route: route.fulfill(path="./demo.js"),
            )
            page.goto(self.page_url)
            # demo.js assigns window.encrypt only when the component fetches
            # its data, which may happen after goto() returns; wait for it.
            page.wait_for_function(
                "() => typeof window.encrypt === 'function'"
            )
        except Exception:
            self.close()
            raise
        self.page = page

    def __get_token(self, offset):
        """Return the API token for ``offset`` by calling ``window.encrypt``.

        The offset is passed to the page as a string, as before, but via
        the ``arg`` parameter of ``evaluate`` rather than by interpolating
        it into JavaScript source.
        """
        return self.page.evaluate(
            "([path, offset]) => window.encrypt(path, offset)",
            ["/api/movie", str(offset)],
        )


if __name__ == "__main__":
    spider = Spider()
    try:
        spider.start()
    finally:
        # Always release the browser process, even if scraping failed.
        spider.close()
- demo.js
// Local copy of https://spa2.scrape.center/js/chunk-10192a00.243cb8b7.js
// The only functional addition is the `window.encrypt = ...` line inside onFetchData.
// Registers the chunk "chunk-10192a00" and its modules on window["webpackJsonp"].
(window["webpackJsonp"] = window["webpackJsonp"] || []).push([["chunk-10192a00"], {
// Modules "5a19" and "79ee" have empty bodies.
"5a19": function(t, a, e) {},
"79ee": function(t, a, e) {},
// Module "ca9c": requires "5a19"; the result is not used further.
ca9c: function(t, a, e) {
"use strict";
var s = e("5a19")
, n = e.n(s);
n.a
},
// Module "d504": defines the "Index" component and exports it as the module default.
d504: function(t, a, e) {
"use strict";
e.r(a);
// s: render function — one "el-card" per entry of t.movies, followed by an "el-pagination".
// NOTE(review): looks like compiled Vue template output; do not edit by hand.
var s = function() {
var t = this
, a = t.$createElement
, e = t._self._c || a;
return e("div", {
attrs: {
id: "index"
}
}, [e("el-row", {
directives: [{
name: "loading",
rawName: "v-loading",
value: t.loading,
expression: "loading"
}]
}, [e("el-col", {
attrs: {
span: 18,
offset: 3
}
}, t._l(t.movies, (function(a) {
return e("el-card", {
key: a.name,
staticClass: "item m-t",
attrs: {
shadow: "hover"
}
}, [e("el-row", [e("el-col", {
attrs: {
xs: 8,
sm: 6,
md: 4
}
}, [e("router-link", {
attrs: {
to: {
name: "detail",
params: {
key: t.transfer(a.id)
}
}
}
}, [e("img", {
staticClass: "cover",
attrs: {
src: a.cover
}
})])], 1), e("el-col", {
staticClass: "p-h",
attrs: {
xs: 9,
sm: 13,
md: 16
}
}, [e("router-link", {
staticClass: "name",
attrs: {
to: {
name: "detail",
params: {
key: t.transfer(a.id)
}
}
}
}, [e("h2", {
staticClass: "m-b-sm"
}, [t._v(t._s(a.name) + " - " + t._s(a.alias))])]), e("div", {
staticClass: "categories"
}, t._l(a.categories, (function(a) {
return e("el-button", {
key: a,
staticClass: "category",
attrs: {
size: "mini",
type: "primary"
}
}, [t._v(t._s(a) + "\n ")])
}
)), 1), e("div", {
staticClass: "m-v-sm info"
}, [e("span", [t._v(t._s(a.regions.join("、")))]), e("span", [t._v(" / ")]), e("span", [t._v(t._s(a.minute) + " 分钟")])]), e("div", {
staticClass: "m-v-sm info"
}, [e("span", [t._v(t._s(a.published_at) + " 上映")])])], 1), e("el-col", {
attrs: {
xs: 5,
sm: 5,
md: 4
}
}, [e("p", {
staticClass: "score m-t-md m-b-n-sm"
}, [t._v(t._s(a.score.toFixed(1)))]), e("p", [e("el-rate", {
attrs: {
value: a.score / 2,
disabled: "",
max: 5,
"text-color": "#ff9900"
}
})], 1)])], 1)], 1)
}
)), 1)], 1), e("el-row", [e("el-col", {
attrs: {
span: 10,
offset: 11
}
}, [e("div", {
staticClass: "pagination m-v-lg"
}, [e("el-pagination", {
attrs: {
background: "",
"current-page": t.page,
"page-size": t.limit,
layout: "total, prev, pager, next",
total: t.total
},
on: {
"current-change": t.onPageChange,
"update:currentPage": function(a) {
t.page = a
},
"update:current-page": function(a) {
t.page = a
}
}
})], 1)])], 1)], 1)
}
, n = []
// i["a"] is the function used below to compute the `token` request parameter.
, i = e("7d92")
, r = e("3e22")
// o: component options (name, data, mounted hook, methods).
, o = {
name: "Index",
components: {},
data: function() {
return {
loading: !1,
total: null,
page: parseInt(this.$route.params.page || 1),
limit: 10,
movies: null
}
},
// Fetch the first page of data as soon as the component is mounted.
mounted: function() {
this.onFetchData()
},
methods: {
transfer: r["a"],
// Push the "indexPage" route for page t, then reload the data.
onPageChange: function(t) {
this.$router.push({
name: "indexPage",
params: {
page: t
}
}),
this.onFetchData()
},
// Compute offset and token, request the index URL with limit/offset/token,
// then store the returned results and count on the component.
onFetchData: function() {
var t = this;
this.loading = !0;
var a = (this.page - 1) * this.limit
, e = Object(i["a"])(this.$store.state.url.index, a);
/**
 * JS interception hook (the one line added to the original chunk).
 * Note: Object(i["a"]) is the function that produced the token above.
 * 1. Bind it to a global, window.encrypt, so that calling
 *    window.encrypt is equivalent to calling Object(i["a"]).
 * 2. The Python side can then call window.encrypt inside the page
 *    to obtain the token value.
 */
window.encrypt = Object(i["a"])
this.$axios.get(this.$store.state.url.index, {
params: {
limit: this.limit,
offset: a,
token: e
}
}).then((function(a) {
var e = a.data
, s = e.results
, n = e.count;
t.loading = !1,
t.movies = s,
t.total = n
}
))
}
}
}
, l = o
, c = (e("ca9c"),
e("e93d"),
e("2877"))
// Combine options (l), render function (s) and n via the helper from module "2877".
, u = Object(c["a"])(l, s, n, !1, null, "8a85e5c6", null);
a["default"] = u.exports
},
// Module "e93d": requires "79ee"; the result is not used further.
e93d: function(t, a, e) {
"use strict";
var s = e("79ee")
, n = e.n(s);
n.a
}
}]);
栏目分类全部>