The spider code:
# -*- coding: utf-8 -*-
import scrapy
from ZhiL import items


class ZhilianSpider(scrapy.Spider):
    name = 'ZhiLian'
    allowed_domains = ['jobs.zhaopin.com']
    start_urls = ['http://jobs.zhaopin.com/']

    def parse(self, response):
        # Collect the job-category links on the index page and follow each one
        job_urls = response.xpath("//div[@class='listcon']/a/@href").extract()
        for job_url in job_urls:
            job_url = response.urljoin(job_url)
            yield scrapy.Request(url=job_url, callback=self.second_parse)

    def second_parse(self, response):
        # Some category links lead to a "page not found" placeholder; skip those
        is404 = response.xpath("//p[@class='error-content__text']/text()").extract_first()
        if is404 == '对不起,您要访问的页面暂时没有找到':
            return
        next_page = response.xpath("//span[@class='search_page_next']/a/@href").extract_first()
        job_name = response.xpath("//span[@class='post']/a/text()").extract()
        company_name = response.xpath("//span[@class='company_name']/a/text()").extract()
        salary = response.xpath("//span[@class='salary']/text()").extract()
        address = response.xpath("//span[@class='address']/a/text()").extract()
        publish_time = response.xpath("//span[@class='release_time']/text()").extract()
        for i in range(len(job_name)):
            item = items.ZhilItem()  # create a fresh item per row so yielded items stay independent
            item["job_name"] = job_name[i].strip()
            item["company_name"] = company_name[i].strip()
            item["salary"] = salary[i].strip()
            item["address"] = address[i].strip()
            item["time"] = publish_time[i].strip()
            yield item
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.second_parse)
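Job-board markup changes often, so before a full run it is worth confirming the XPaths still match in scrapy shell. A quick check might look like this (the URL and selector are the ones used in parse above):

# Start an interactive shell against the index page:
#     scrapy shell "http://jobs.zhaopin.com/"
# Then confirm the category links are actually found:
response.xpath("//div[@class='listcon']/a/@href").extract()[:5]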
The items code:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ZhilItem(scrapy.Item):
    job_name = scrapy.Field()
    address = scrapy.Field()
    company_name = scrapy.Field()
    salary = scrapy.Field()
    time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into job(job_name, address, company, salary, publish)
            VALUES (%s, %s, %s, %s, %s)
        """
        params = (
            self['job_name'], self['address'], self['company_name'],
            self['salary'], self['time']
        )
        return insert_sql, params
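The insert statement assumes a job table with columns job_name, address, company, salary and publish already exists in the database. The schema itself is not shown here, so the following one-off setup script is only a guess; the column types and lengths are assumptions you can change:

# One-off table setup; only the column names come from the insert SQL above,
# the types and lengths are assumptions.
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='lindong',
                       database='job', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS job (
                id INT AUTO_INCREMENT PRIMARY KEY,
                job_name VARCHAR(255),
                address VARCHAR(255),
                company VARCHAR(255),
                salary VARCHAR(64),
                publish VARCHAR(64)
            ) DEFAULT CHARSET = utf8
        """)
    conn.commit()
finally:
    conn.close()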
The pipeline that writes to MySQL asynchronously:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi


class ZhilPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlTwistedPipeline(object):
    """Insert items into MySQL asynchronously through Twisted's adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbpool = adbapi.ConnectionPool(
            "pymysql",
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        # Hand the insert off to a pool thread so the crawl never blocks on MySQL
        self.dbpool.runInteraction(self.do_insert, item)
        return item

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
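One thing to be aware of: runInteraction returns a Deferred, and with the code above any failed INSERT (encoding problem, missing column, and so on) disappears silently. A small optional extension is to attach an errback that logs the failure; handle_error below is a hypothetical helper name, not part of the original pipeline:

class MysqlTwistedPipeline(object):
    # __init__, from_settings and do_insert stay exactly as above

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        # handle_error is an assumed helper, added here only for logging
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # failure wraps the exception raised inside do_insert
        spider.logger.error("MySQL insert failed for %r: %s", dict(item), failure)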
Configure your MySQL connection in settings.py and enable the pipeline:
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'job'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'lindong'
ITEM_PIPELINES = {
    'ZhiL.pipelines.MysqlTwistedPipeline': 300,
}
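Depending on how the site responds, a few standard Scrapy settings may also need adjusting. None of these appear in the configuration above; treat them as optional suggestions:

# Optional extras, not part of the original settings.py
ROBOTSTXT_OBEY = False   # the listing pages may be disallowed by robots.txt
DOWNLOAD_DELAY = 1       # throttle requests a little to stay polite
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # a browser-like UA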
Then run scrapy crawl ZhiLian and the scraped jobs start landing in MySQL.
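To double-check from Python that rows are actually arriving, a quick query against the same database works (connection details as configured in settings.py above):

# Sanity check: count and peek at the inserted rows
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='lindong',
                       database='job', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM job")
        print("rows so far:", cursor.fetchone()[0])
        cursor.execute("SELECT job_name, company, salary FROM job LIMIT 5")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()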
