糗事百科之抓取多个页面

修改之前的 qsbk.py 文件

# -*- coding: utf-8 -*-
import scrapy

# 从根目录 导入 item 类
from spider.items import SpiderItem


class QsbkSpider(scrapy.Spider):
    """Spider that scrapes text posts page by page, following the "next" link.

    Each post is emitted as a ``SpiderItem`` with ``author`` and ``content``
    fields. After a page is processed, the href of the last pagination
    ``<li>`` is requested and parsed by the same callback; the crawl stops
    when no such href is found.
    """

    name = 'qsbk'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    # Kept for backward compatibility with code that reads this attribute;
    # pagination URLs are now resolved against the response URL instead.
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        """Yield one item per post on the page, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``SpiderItem`` objects for each post, followed by at most one
            ``scrapy.Request`` for the next page.
        """
        posts = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for post in posts:
            author = post.xpath(".//h2/text()").get()
            if author is None:
                # A child <div> without an <h2> would previously crash the
                # whole callback on ``None.strip()`` and stop pagination.
                # NOTE(review): presumably such nodes are not posts — confirm.
                continue
            fragments = post.xpath('.//div[@class="content"]//text()').getall()
            content = ' '.join(fragments).strip()
            yield SpiderItem(author=author.strip(), content=content)

        # Locate the next-page URL: the href of the last <li> in the pager.
        # If it is missing, the current page is treated as the last one;
        # otherwise the next page is requested and parsed by this callback.
        next_url = response.xpath('//div[@class="col1 old-style-col1"]'
                                  '/ul/li[last()]/a/@href').get()
        if next_url:
            # urljoin resolves root-relative, page-relative and absolute
            # hrefs alike, unlike plain string concatenation.
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.parse)

查找下一页的 url,利用 scrapy.Request() 请求下一页,并通过回调函数解析返回的 html。

Update time: 2020-05-27

results matching ""

    No results matching ""