# settings.py (3.3 KB)
  1. # Scrapy settings for gzh_spider project
  2. #
  3. # For simplicity, this file contains only settings considered important or
  4. # commonly used. You can find more settings consulting the documentation:
  5. #
  6. # https://docs.scrapy.org/en/latest/topics/settings.html
  7. # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  8. # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  9. BOT_NAME = "gzh_spider"
  10. SPIDER_MODULES = ["gzh_spider.spiders"]
  11. NEWSPIDER_MODULE = "gzh_spider.spiders"
  12. # Crawl responsibly by identifying yourself (and your website) on the user-agent
  13. #USER_AGENT = "gzh_spider (+http://www.yourdomain.com)"
  14. # Obey robots.txt rules
  15. ROBOTSTXT_OBEY = False
  16. LOG_LEVEL = "ERROR"
  17. # Configure maximum concurrent requests performed by Scrapy (default: 16)
  18. #CONCURRENT_REQUESTS = 32
  19. # Configure a delay for requests for the same website (default: 0)
  20. # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
  21. # See also autothrottle settings and docs
  22. #DOWNLOAD_DELAY = 3
  23. # The download delay setting will honor only one of:
  24. #CONCURRENT_REQUESTS_PER_DOMAIN = 16
  25. #CONCURRENT_REQUESTS_PER_IP = 16
  26. # Disable cookies (enabled by default)
  27. #COOKIES_ENABLED = False
  28. # Disable Telnet Console (enabled by default)
  29. #TELNETCONSOLE_ENABLED = False
  30. # Override the default request headers:
  31. #DEFAULT_REQUEST_HEADERS = {
  32. # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  33. # "Accept-Language": "en",
  34. #}
  35. # Enable or disable spider middlewares
  36. # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  37. #SPIDER_MIDDLEWARES = {
  38. # "gzh_spider.middlewares.GzhSpiderSpiderMiddleware": 543,
  39. #}
  40. # Enable or disable downloader middlewares
  41. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  42. #DOWNLOADER_MIDDLEWARES = {
  43. # "gzh_spider.middlewares.GzhSpiderDownloaderMiddleware": 543,
  44. #}
  45. # Enable or disable extensions
  46. # See https://docs.scrapy.org/en/latest/topics/extensions.html
  47. #EXTENSIONS = {
  48. # "scrapy.extensions.telnet.TelnetConsole": None,
  49. #}
  50. # Configure item pipelines
  51. # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  52. ITEM_PIPELINES = {
  53. "gzh_spider.pipelines.GzhSpiderPipeline": 300,
  54. }
  55. # Enable and configure the AutoThrottle extension (disabled by default)
  56. # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
  57. #AUTOTHROTTLE_ENABLED = True
  58. # The initial download delay
  59. #AUTOTHROTTLE_START_DELAY = 5
  60. # The maximum download delay to be set in case of high latencies
  61. #AUTOTHROTTLE_MAX_DELAY = 60
  62. # The average number of requests Scrapy should be sending in parallel to
  63. # each remote server
  64. #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  65. # Enable showing throttling stats for every response received:
  66. #AUTOTHROTTLE_DEBUG = False
  67. # Enable and configure HTTP caching (disabled by default)
  68. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  69. #HTTPCACHE_ENABLED = True
  70. #HTTPCACHE_EXPIRATION_SECS = 0
  71. #HTTPCACHE_DIR = "httpcache"
  72. #HTTPCACHE_IGNORE_HTTP_CODES = []
  73. #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
  74. # Set settings whose default value is deprecated to a future-proof value
  75. REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
  76. TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
  77. FEED_EXPORT_ENCODING = "utf-8"