利用scrapy爬取某招聘网站全部职业的部分信息,并异步写入数据库

发布于 / python / 2 条评论

spider部分代码

# -*- coding: utf-8 -*-
import scrapy
from ZhiL import items

class ZhilianSpider(scrapy.Spider):
    """Crawl every job-category listing on jobs.zhaopin.com and yield
    one ZhilItem per posting, following pagination until a 404 page."""
    name = 'ZhiLian'
    allowed_domains = ['jobs.zhaopin.com']
    start_urls = ['http://jobs.zhaopin.com/']

    def parse(self, response):
        """Extract each job-category URL from the index page and schedule
        its listing page for parsing."""
        job_urls = response.xpath("//div[@class='listcon']/a/@href").extract()
        for job_url in job_urls:
            # hrefs may be relative; resolve against the current page URL
            yield scrapy.Request(url=response.urljoin(job_url),
                                 callback=self.second_parse)

    def second_parse(self, response):
        """Parse one listing page: yield an item per posting, then follow
        the next-page link until the site serves its error page."""
        is404 = response.xpath("//p[@class='error-content__text']/text()").extract_first()
        if is404 == '对不起,您要访问的页面暂时没有找到':
            # Pagination ran past the last page; stop this chain.
            return

        job_name = response.xpath("//span[@class='post']/a/text()").extract()
        company_name = response.xpath("//span[@class='company_name']/a/text()").extract()
        salary = response.xpath("//span[@class='salary']/text()").extract()
        address = response.xpath("//span[@class='address']/a/text()").extract()
        publish_time = response.xpath("//span[@class='release_time']/text()").extract()

        # Bug fixes vs. the original:
        #  * .strip('') was a no-op (strips nothing) -> .strip()
        #  * a single item instance was mutated and re-yielded each
        #    iteration; the async MySQL pipeline may still hold a
        #    reference to the previous yield, so every row needs its
        #    own ZhilItem instance.
        for name, company, pay, addr, published in zip(
                job_name, company_name, salary, address, publish_time):
            item = items.ZhilItem()
            item["job_name"] = name.strip()
            item["company_name"] = company.strip()
            item["salary"] = pay.strip()
            item["address"] = addr.strip()
            item["time"] = published.strip()
            yield item

        next_page = response.xpath("//span[@class='search_page_next']/a/@href").extract_first()
        if next_page:  # guard: urljoin(None) would raise TypeError on the last page
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.second_parse)

items代码

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhilItem(scrapy.Item):
    """One scraped job posting, plus its own SQL for the async pipeline."""
    # Fields populated by ZhilianSpider.second_parse
    job_name = scrapy.Field()
    address = scrapy.Field()
    company_name = scrapy.Field()
    salary = scrapy.Field()
    time = scrapy.Field()

    def get_insert_sql(self):
        """Return the parameterized INSERT statement and its value tuple
        for writing this item into the `job` table."""
        insert_sql = """
                          insert into job(job_name, address, company, salary, publish)
                          VALUES (%s,%s,%s,%s,%s)
 
                    """
        # Order must match the column list in the statement above.
        params = (
            self['job_name'],
            self['address'],
            self['company_name'],
            self['salary'],
            self['time'],
        )
        return insert_sql, params

异步写入的管道代码

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors


class ZhilPipeline(object):
    """Scrapy's default no-op pipeline: items pass through unchanged."""

    def process_item(self, item, spider):
        """Return the item untouched so any later pipeline receives it."""
        return item


class MysqlTwistedPipeline(object):
    """Write items to MySQL asynchronously through twisted's adbapi pool,
    so slow inserts never block the crawl."""

    def __init__(self, dbpool):
        # adbapi.ConnectionPool runs blocking DB calls in a thread pool.
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings (MYSQL_* keys)."""
        dbpool = adbapi.ConnectionPool(
            "pymysql",
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule an async insert and hand the item onward.

        Fixes vs. the original: the Deferred's failures were silently
        dropped (no errback), and the item was not returned — Scrapy
        requires process_item to return the item for later pipelines.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        """Log insert failures instead of losing them inside the Deferred."""
        spider.logger.error("MySQL insert failed for %r: %s", dict(item), failure)

    def do_insert(self, cursor, item):
        """Runs in a pool thread: execute the item's own INSERT statement."""
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

在settings.py里配置好你的MySQL连接信息,并启用你的管道

# MySQL connection settings read by MysqlTwistedPipeline.from_settings.
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'job'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'lindong'
# Enable the async MySQL pipeline; 300 is its priority (lower runs first).
ITEM_PIPELINES = {
  'ZhiL.pipelines.MysqlTwistedPipeline': 300,
}

然后我们就可以拿到数据了

转载原创文章请注明,转载自: ت » 利用scrapy爬取某招聘网站全部职业的部分信息,并异步写入数据库
  1. avatar

    不错,有点东西

  2. avatar

    我是林冬的儿子,林冬真棒