如何使用scrapy爬取资源,你懂得
- - CSDN博客编程语言推荐文章前言:有没有看点视频感觉到处都是广告,有没有觉得它的播放速度很慢,不要担心,看完这篇文章你就是老司机了. scrapy官方文档上说不建议使用这个方法来安装,但是经过我的实验发现官方网站上的安装方法都不好使,ubuntu上的安装文档如下:. 创建之后在文件目录下面就有了初始的项目结构.
前言:有没有看点视频感觉到处都是广告,有没有觉得它的播放速度很慢,不要担心,看完这篇文章你就是老司机了
1.安装scrapy
sudo apt-get install python-scrapy
说明:scrapy官方文档上说不建议使用这个方法来安装,但是经过我的实验发现官方网站上的安装方法都不好使,ubuntu上的安装文档如下:
http://doc.scrapy.org/en/latest/intro/install.html#ubuntu-9-10-or-above
2.创建项目
sudo scrapy startproject Mp4
创建之后,在文件目录下面就有了初始的项目结构。
3.Talk is cheap. Show me the code
废话少说,放“码”过来!
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Mp4Item(scrapy.Item):
    """Scraped item with two fields: ``mp4name`` and ``mp4url``."""

    # Both fields are plain scrapy.Field() declarations with no metadata.
    mp4name = scrapy.Field()
    mp4url = scrapy.Field()
middlewares.py
# -*-coding:utf-8-*-
"""Downloader middleware that picks a random User-Agent for each request."""

import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Set a randomly chosen ``User-Agent`` header on outgoing requests.

    The candidates come from the class attribute ``user_agent_list``.
    The header is written with ``setdefault``, so a ``User-Agent`` that is
    already present on the request is left untouched.
    """

    # Each entry is one User-Agent string, split over two adjacent literals
    # only to keep the lines short.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        """Store ``user_agent`` through the base-class initializer."""
        super(RotateUserAgentMiddleware, self).__init__(user_agent)

    def process_request(self, request, spider):
        """Attach a random User-Agent from ``user_agent_list`` to ``request``.

        Returns ``None`` implicitly. Nothing is set when the list is empty
        or the chosen entry is an empty string.
        """
        if not self.user_agent_list:
            # random.choice() raises IndexError on an empty sequence.
            return
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)


# pipeline.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import json


class Mp4Pipeline(object):
    """Append every scraped item to ``mp4.json`` as one JSON object per line."""

    def __init__(self):
        """Open ``mp4.json`` in the working directory for UTF-8 output."""
        self.file = codecs.open('mp4.json', mode='wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and return it unchanged.

        ``ensure_ascii=False`` keeps non-ASCII characters readable in the
        output. The previous approach decoded the dumped string with
        ``unicode_escape``, which also unescaped ``\\"``, ``\\\\`` and
        ``\\n`` (yielding invalid JSON) and only worked on Python 2.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk."""
        self.file.close()


# settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for Mp4 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Mp4'

SPIDER_MODULES = ['Mp4.spiders']
NEWSPIDER_MODULE = 'Mp4.spiders'

COOKIES_ENABLED = False

ITEM_PIPELINES = {
    'Mp4.pipelines.Mp4Pipeline': 300,
}

# Disable the stock User-Agent middleware and install the rotating one.
# The stock middleware is referenced by its current module path; the old
# ``scrapy.contrib.downloadermiddleware`` location is deprecated.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'Mp4.middlewares.RotateUserAgentMiddleware': 400,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mp4 (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Mp4.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Mp4.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'Mp4.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
Mp4CrawlSpider.py
# -*- coding: utf-8 -*-
"""Crawl spider that follows list pagination and downloads linked .mp4 files."""

import logging

import scrapy
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import CrawlSpider, Rule

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

logger = logging.getLogger(__name__)


def _log_link_value(value):
    """Log one extracted link value and return it unchanged."""
    logger.debug('value is %s', value)
    return value


class ExampleSpider(CrawlSpider):
    """Follow pagination links, open each thumbnail page, fetch its .mp4 URLs.

    NOTE: with ``CrawlSpider`` the rule callback only runs for pages reached
    through the rule; responses for ``start_urls`` themselves go through
    ``parse_start_url`` and are not passed to ``parse_item``.
    """

    name = "Mp4CrawlSpider"
    download_delay = 2
    allowed_domains = ["huangsewangzhan.com"]
    start_urls = [
        'http://huangsewangzhan.com/vodlist/?5.html',
        'http://huangsewangzhan.com/vodlist/?6.html',
        'http://huangsewangzhan.com/vodlist/?7.html',
        'http://huangsewangzhan.com/vodlist/?8.html',
        'http://huangsewangzhan.com/vodlist/?9.html',
    ]

    # Kept as a class attribute for callers that reference it by name.
    process_value = staticmethod(_log_link_value)

    rules = [
        Rule(
            LxmlLinkExtractor(
                allow=('/vodlist/'),
                restrict_xpaths=('//div[@class="page"]'),
                # Must be a callable: a string here is silently ignored,
                # so the hook would never run.
                process_value=_log_link_value,
            ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        """Yield one request per thumbnail link found on a list page."""
        logger.debug('mp4url is %s', response.url)
        for href in response.xpath('//div[@class="thumb"]/a/@href').extract():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_dir_contents,
            )

    def parse_dir_contents(self, response):
        """Download every ``http...mp4`` URL found between single quotes.

        The URLs are searched in the raw page text rather than via XPath.
        Each file is saved in the working directory under the last path
        segment of its URL.
        """
        # Decode once so the text is handled as str on Python 2 and 3 alike.
        encoding = getattr(response, 'encoding', None) or 'utf-8'
        text = response.body.decode(encoding, 'replace')
        for url in text.split("'"):
            if not (url.startswith("http") and url.endswith(".mp4")):
                continue
            logger.info('real url is %s', url)
            local = url.split('/')[-1]
            try:
                # NOTE: this is a blocking download inside a Scrapy callback.
                urlretrieve(url, local)
            except (IOError, OSError):
                # Keep going so one failed file does not drop the rest.
                logger.exception('failed to download %s', url)


# 额,其实前面几段代码都没有用,写得不规范了,没有模块化,这个类负责了主要工作,
这个html里的这段代码是javascript生成的,所以没有办法使用xpath获得,我选择了直接解析字符串。
功夫不负有心人呀,结果就是这样的