# The code is as follows.

# 导入包
import random
import re
import time

import scrapy
from scrapy import FormRequest


class QuotesSpider(scrapy.Spider):
    """Spider that walks ithome.com category listings and scrapes articles.

    Listing pages are fetched by POSTing form data to each channel's
    ``getajaxdata.aspx`` endpoint; every article link found in a listing is
    then requested and turned into a ``{'title', 'body'}`` item.
    """

    # Spider name, used to select this spider when running the crawl.
    name = "ming"

    # Listing endpoints, one per site channel.
    ajax_urls = (
        'https://it.ithome.com/ithome/getajaxdata.aspx',
        'https://win10.ithome.com/ithome/getajaxdata.aspx',
        'https://soft.ithome.com/ithome/getajaxdata.aspx',
        'https://iphone.ithome.com/ithome/getajaxdata.aspx',
        'https://android.ithome.com/ithome/getajaxdata.aspx',
        'https://digi.ithome.com/ithome/getajaxdata.aspx',
        'https://next.ithome.com/ithome/getajaxdata.aspx',
    )

    # Form fields sent with every listing request; subclasses may override.
    category_id = '32'
    # Number of listing pages requested per endpoint (pages 0 .. max_pages-1).
    max_pages = 2800

    # Absolute URL of an article page. Compiled once as a raw string so the
    # backslashes reach the regex engine untouched, with the dot before
    # "htm" escaped and the scheme required.
    _ARTICLE_URL_RE = re.compile(r'^(?:https?|ftp|rtsp|mms)://\S+\.htm')

    def start_requests(self):
        """Yield one form POST per (endpoint, page) pair.

        Returns:
            An iterator of ``FormRequest`` objects whose responses are
            handled by :meth:`parse`.
        """
        for url in self.ajax_urls:
            for page in range(self.max_pages):
                formdata = {
                    "categoryid": self.category_id,
                    "type": "pccategorypage",
                    "page": str(page),
                }
                yield FormRequest(url, callback=self.parse, formdata=formdata)

    def parse(self, response):
        """Handle both listing responses and article pages.

        For every ``h2 a`` link whose href looks like an article URL, a
        follow-up request is scheduled with this same callback. When the
        response itself is an article page, an item is emitted.

        Args:
            response: The downloaded Scrapy response.

        Yields:
            ``scrapy.Request`` objects for discovered article links, and a
            dict with ``title`` (text of ``.post_title h1``) and ``body``
            (the ``#paragraph`` element) when ``response.url`` is an article
            URL. Either value is ``None`` if its selector matches nothing.
        """
        for href in response.css('h2 a::attr(href)').getall():
            if self._ARTICLE_URL_RE.match(href):
                yield scrapy.Request(href, callback=self.parse)

        if self._ARTICLE_URL_RE.match(response.url):
            yield {
                'title': response.css('.post_title h1::text').get(),
                'body': response.css('#paragraph').get(),
            }