{"id":204387,"date":"2025-05-29T14:15:57","date_gmt":"2025-05-29T06:15:57","guid":{"rendered":"https:\/\/server.hk\/cnblog\/204387\/"},"modified":"2025-05-29T14:15:57","modified_gmt":"2025-05-29T06:15:57","slug":"%e5%a6%82%e4%bd%95%e5%ae%9a%e5%88%b6%e5%8c%96%e5%a4%84%e7%90%86crawlspider%e4%b8%adrule%e8%a7%a3%e6%9e%90%e8%bf%87%e7%9a%84%e9%93%be%e6%8e%a5%ef%bc%9f","status":"publish","type":"post","link":"https:\/\/server.hk\/cnblog\/204387\/","title":{"rendered":"\u5982\u4f55\u5b9a\u5236\u5316\u5904\u7406CrawlSpider\u4e2dRule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\uff1f"},"content":{"rendered":"<p><b><\/b>     <\/p>\n<h1>\u5982\u4f55\u5b9a\u5236\u5316\u5904\u7406CrawlSpider\u4e2dRule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\uff1f<\/h1>\n<p>\u4f60\u5728\u5b66\u4e60\u76f8\u5173\u7684\u77e5\u8bc6\u5417\uff1f\u672c\u6587<span style=\"color: #FF6600;, Helvetica, Arial, sans-serif;font-size: 14px;background-color: #FFFFFF\">\u300a\u5982\u4f55\u5b9a\u5236\u5316\u5904\u7406CrawlSpider\u4e2dRule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\uff1f\u300b<\/span>\uff0c\u4e3b\u8981\u4ecb\u7ecd\u7684\u5185\u5bb9\u5c31\u6d89\u53ca\u5230<span style=\"color: #FF6600;, Helvetica, Arial, sans-serif;font-size: 14px;background-color: #FFFFFF\"><\/span>\uff0c\u5982\u679c\u4f60\u60f3\u63d0\u5347\u81ea\u5df1\u7684\u5f00\u53d1\u80fd\u529b\uff0c\u5c31\u4e0d\u8981\u9519\u8fc7\u8fd9\u7bc7\u6587\u7ae0\uff0c\u5927\u5bb6\u8981\u77e5\u9053\u7f16\u7a0b\u7406\u8bba\u57fa\u7840\u548c\u5b9e\u6218\u64cd\u4f5c\u90fd\u662f\u4e0d\u53ef\u6216\u7f3a\u7684\u54e6\uff01<\/p>\n<p><img decoding=\"async\" src=\"https:\/\/www.17golang.com\/uploads\/20241106\/1730875540672b109436553.jpg\" 
class=\"aligncenter\"><\/p>\n<p><strong>\u5982\u4f55\u9488\u5bf9CrawlSpider\u4e2d\u7684Rule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\u8fdb\u884c\u5b9a\u5236\u5316\u5904\u7406<\/strong><\/p>\n<p>Scrapy\u6846\u67b6\u4e2d\u7684CrawlSpider\u4e3a\u722c\u866b\u5b9a\u5236\u5f00\u53d1\u63d0\u4f9b\u4e86\u7075\u6d3b\u6027\u3002\u5728Rule\u4e2d\u8bbe\u7f6eLinkExtractor\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u7ee7\u7eed\u5bf9\u89e3\u6790\u540e\u7684\u94fe\u63a5\u8fdb\u884c\u5b9a\u5236\u5316\u5904\u7406\u3002\u4f8b\u5982\uff0c\u6211\u4eec\u53ef\u80fd\u4f1a\u5e0c\u671b\u5bf9\u8be6\u60c5\u9875\u94fe\u63a5\u8fdb\u884c\u989d\u5916\u7684\u5904\u7406\u3002<\/p>\n<p><strong>\u89e3\u51b3\u65b9\u6848\uff1a<\/strong><\/p>\n<p>\u4e3a\u4e86\u5bf9Rule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\u8fdb\u884c\u4fee\u6539\uff0c\u6211\u4eec\u9700\u8981\u5728Downloader Middleware\u4e2d\u5b9a\u4e49process_request\u65b9\u6cd5\u3002\u8fd9\u662f\u4e00\u4e2a\u5168\u5c40\u65b9\u6cd5\uff0c\u5b83\u5c06\u5904\u7406\u6240\u6709\u4f20\u5165\u7684\u8bf7\u6c42\u3002\u5728\u8be5\u65b9\u6cd5\u4e2d\uff0c\u6211\u4eec\u53ef\u4ee5\u68c0\u67e5\u6bcf\u4e2a\u8bf7\u6c42\u7684url\uff0c\u5e76\u6839\u636e\u9700\u8981\u8fdb\u884c\u5904\u7406\u3002<\/p>\n<p><strong>\u4ee3\u7801\u793a\u4f8b\uff1a<\/strong><\/p>\n<pre>def process_request(self, request, spider):\n    if request.url.endswith(\".html\"):\n        return scrapy.http.HtmlResponse(url=request.url, body=b\"\", 
encoding=\"utf-8\")<\/pre>\n<p>\u5728\u8fd9\u4e2a\u793a\u4f8b\u4e2d\uff0c\u6211\u4eec\u68c0\u67e5\u8bf7\u6c42\u7684url\u662f\u5426\u4ee5&#8220;.html&#8221;\u7ed3\u5c3e\u3002\u5982\u679c\u662f\u7684\u8bdd\uff0c\u6211\u4eec\u5c06\u8fd4\u56de\u4e00\u4e2a\u65b0\u7684HtmlResponse\u5bf9\u8c61\uff0c\u5176\u4e2durl\u4e3a\u8bf7\u6c42\u7684url\uff0c\u6b63\u6587\u4e3a\u7a7a\uff0c\u7f16\u7801\u4e3autf-8\u3002<\/p>\n<p>\u901a\u8fc7\u8fd9\u79cd\u65b9\u6cd5\uff0c\u6211\u4eec\u53ef\u4ee5\u8f7b\u677e\u5730\u4fee\u6539Rule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\uff0c\u6ee1\u8db3\u6211\u4eec\u7279\u5b9a\u7684\u9700\u6c42\u3002<\/p>\n<p>\u7406\u8bba\u8981\u638c\u63e1\uff0c\u5b9e\u64cd\u4e0d\u80fd\u843d\uff01\u4ee5\u4e0a\u5173\u4e8e\u300a\u5982\u4f55\u5b9a\u5236\u5316\u5904\u7406CrawlSpider\u4e2dRule\u89e3\u6790\u8fc7\u7684\u94fe\u63a5\uff1f\u300b\u7684\u8be6\u7ec6\u4ecb\u7ecd\uff0c\u5927\u5bb6\u90fd\u638c\u63e1\u4e86\u5427\uff01\u5982\u679c\u60f3\u8981\u7ee7\u7eed\u63d0\u5347\u81ea\u5df1\u7684\u80fd\u529b\uff0c\u90a3\u4e48\u5c31\u6765\u5173\u6ce8\u516c\u4f17\u53f7\u5427\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u5982\u4f55\u5b9a\u5236\u5316\u5904\u7406CrawlSpid&#46;&#46;&#46;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4925],"tags":[],"class_list":["post-204387","post","type-post","status-publish","format-standard","hentry","category-4925"],"_links":{"self":[{"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/posts\/204387","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/comments?post=204387"}],"version-history":[{"count":0,"href":"https:
\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/posts\/204387\/revisions"}],"wp:attachment":[{"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/media?parent=204387"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/categories?post=204387"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/server.hk\/cnblog\/wp-json\/wp\/v2\/tags?post=204387"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}