估摸着各位小伙伴儿在想用CrawlSpider的Rule来抓取JS渲染的页面时,都被折磨得不轻;
CrawlSpider Rule总是不能和Splash结合。
废话不多说,手疼····
方法1:
写一个自定义的函数,通过Rule的process_request参数传入,用它来替换Rule默认生成Request的逻辑。
参考官方文档:
1、将请求更换为SplashRequest请求:
2、每次请求将本次请求的URL使用Meta参数传递下去;
3、重写 _requests_to_follow 方法:把响应(Response)的URL替换为我们通过meta传递下来的URL(否则响应的URL会是Splash服务的地址)
就像下面这样
class MySpider(CrawlSpider):
    """CrawlSpider whose rule-extracted links are fetched through Splash.

    Approach 1: every Splash request is sent with
    ``dont_process_response=True`` and carries the page's own URL in
    ``meta['real_url']``. ``_requests_to_follow`` puts that URL back on the
    response before links are extracted from it.

    The start URLs are read from ``self.start_urls``, which must be supplied
    by the user of this spider.
    """

    name = 'innda'

    # Raw strings keep the regex escapes intact: ``\d+`` matches the numeric
    # page id and ``\.`` matches a literal dot.
    rules = (
        Rule(LinkExtractor(allow=(r'node_\d+\.htm',)),
             process_request='splash_request', follow=True),
        Rule(LinkExtractor(allow=(r'content_\d+\.htm',)),
             callback="one_parse"),
    )

    def start_requests(self):
        """Yield one Splash request for each URL in ``self.start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(
                url,
                dont_process_response=True,
                args={'wait': 0.5},
                meta={'real_url': url},
            )

    def splash_request(self, request, response=None):
        """Re-issue a rule-generated request as a ``SplashRequest``.

        :param request: the request the rule built for an extracted link
        :param response: the response the link was extracted from; unused,
            accepted so the hook can be called with one or two arguments
        :return: a ``SplashRequest`` for the same URL
        """
        # Carry over everything already stored in the request's meta and add
        # the page's own URL, so _requests_to_follow can restore it later.
        meta = dict(request.meta, real_url=request.url)
        return SplashRequest(
            url=request.url,
            # Keep the callback/errback that were attached to the request so
            # the response is still routed the way the rule intended.
            callback=request.callback,
            errback=request.errback,
            # Keep the response object type unchanged (an HtmlResponse
            # rather than a SplashTextResponse).
            dont_process_response=True,
            # Ask Splash to wait 0.5 s while rendering the page.
            args={'wait': 0.5},
            meta=meta,
        )

    def _requests_to_follow(self, response):
        """Yield follow-up requests, extracting links against the real URL.

        Overrides the ``CrawlSpider`` method of the same name.

        :param response: the response to extract links from
        :return: generator of the requests to follow
        """
        real_url = response.meta.get('real_url')
        if real_url:
            # The URL is not changed in place: replace() returns a new
            # response object, which is the one handed on below. Responses
            # without a recorded real URL are passed through untouched.
            response = response.replace(url=real_url)
        # The type check, link de-duplication and per-rule processing are
        # left to the parent implementation instead of being copied here.
        yield from super()._requests_to_follow(response)

    def one_parse(self, response):
        """Callback of the content-page rule: print the response URL."""
        print(response.url)
方法2:
这就很简单啦!干掉类型检查就是了(/≧▽≦)/
就像这样:
class MySpider(CrawlSpider):
    """CrawlSpider whose rule-extracted links are fetched through Splash.

    Approach 2: Splash requests are sent with their default response
    handling, and ``_requests_to_follow`` is overridden without the
    ``HtmlResponse`` type check so that the resulting responses are not
    discarded before link extraction.

    The start URLs are read from ``self.start_urls``, which must be supplied
    by the user of this spider.
    """

    name = 'innda'

    # Raw strings keep the regex escapes intact: ``\d+`` matches the numeric
    # page id and ``\.`` matches a literal dot.
    rules = (
        Rule(LinkExtractor(allow=(r'node_\d+\.htm',)),
             process_request='splash_request', follow=True),
        Rule(LinkExtractor(allow=(r'content_\d+\.htm',)),
             callback="one_parse"),
    )

    def start_requests(self):
        """Yield one Splash request for each URL in ``self.start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(url, args={'wait': 0.5})

    def splash_request(self, request, response=None):
        """Re-issue a rule-generated request as a ``SplashRequest``.

        :param request: the request the rule built for an extracted link
        :param response: the response the link was extracted from; unused,
            accepted so the hook can be called with one or two arguments
        :return: a ``SplashRequest`` for the same URL
        """
        return SplashRequest(
            url=request.url,
            # Keep the callback/errback/meta that were attached to the
            # request so the response is still routed the way the rule
            # intended.
            callback=request.callback,
            errback=request.errback,
            # Ask Splash to wait 0.5 s while rendering the page.
            args={'wait': 0.5},
            meta=dict(request.meta),
        )

    def _requests_to_follow(self, response):
        """Yield follow-up requests for every rule, for any response type.

        Overrides the ``CrawlSpider`` method of the same name. Unlike
        approach 1 there is deliberately no
        ``isinstance(response, HtmlResponse)`` guard, and the response is
        used as received (its URL is not rewritten).

        NOTE(review): ``rule.process_request`` is called with the request
        only; confirm that this matches the installed Scrapy version.

        :param response: the response to extract links from
        :return: generator of the requests to follow
        """
        # Links already yielded for this response, shared across all rules.
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def one_parse(self, response):
        """Callback of the content-page rule: print the response URL."""
        print(response.url)
以上完毕@_@!!