Python Crawler Practice: Fetching Web Pages
Python ships with the urllib and urllib2 modules, and the third-party requests library can also be used to fetch web pages. Here we use the easy_install package manager to install the requests and BeautifulSoup libraries: at the CMD command line, switch to the easy_install directory and run easy_install followed by the package name. For example:
easy_install requests
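BeautifulSoup is installed the same way:

easy_install BeautifulSoup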
Once the requests package is installed, we can choose among urllib, urllib2, and requests to fetch web pages. The script below demonstrates urllib2 (including URLError and HTTPError handling) and requests:
#! /usr/bin/env python
#coding:utf-8
import urllib
import urllib2
import requests
import sys

url = 'http://www.csdn.net'

def urllib2Test():
    # Fetch the page with urllib2: build a Request, open it, read the body
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    thePage = response.read()

def requestsTest():
    # The same fetch with requests; the status code, body, and headers
    # are attributes of the response object
    r = requests.get(url)
    r.status_code
    r.content
    r.headers

def urllib2TestEx(url):
    # Catch network-level failures (DNS errors, refused connections, ...)
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
        content = response.read()
    except urllib2.URLError, e:
        print e.reason

def urlhttperror(url):
    # Catch HTTP-level errors (404, 500, ...) and print the error body
    req = urllib2.Request(url)
    try:
        urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print e.read()

if __name__ == '__main__':
    urllib2Test()
    requestsTest()
    urllib2TestEx(url)
    urlhttperror(url)
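The script above exercises urllib2 and requests but not urllib; for completeness, here is a minimal sketch of the same fetch using the urllib module (Python 2 API):

#! /usr/bin/env python
#coding:utf-8
import urllib

url = 'http://www.csdn.net'

def urllibTest():
    # urllib.urlopen returns a file-like object; read() yields the body
    response = urllib.urlopen(url)
    thePage = response.read()
    response.close()

if __name__ == '__main__':
    urllibTest()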
In practice, a fetcher should identify itself with a browser User-Agent, set a timeout, and retry on failure. The script below wraps requests.get accordingly:

#! /usr/bin/env python
#coding:utf-8
import requests
from BeautifulSoup import BeautifulSoup
from os.path import dirname, abspath
import sys
import os
import time  # needed by time.sleep below; missing from the original script

#PREFIX = dirname(abspath(__file__))

## This block works around encoding errors with Chinese text
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################

defaultWaitTime = 1

def getHtmlContent(url):
    # Fetch url with a browser User-Agent and a 10-second timeout,
    # retrying up to 5 times with a pause between attempts
    global defaultWaitTime
    content = None
    retry = 0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'}
    AccessFrequency = defaultWaitTime
    while retry < 5:
        try:
            r = requests.get(url, timeout=10, headers=headers)
            content = r.content
            return content
        except:
            retry += 1
            time.sleep(AccessFrequency)
    return content

def Test():
    content = getHtmlContent("http://blog.csdn.net/figo829/article/details/18015537")
    # print content

if __name__ == '__main__':
    Test()
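BeautifulSoup is imported in the script above but never used. Here is a minimal sketch of handing the fetched HTML to it, assuming the BeautifulSoup 3 API implied by the import style (parseLinks is a hypothetical helper, not part of the original script):

#coding:utf-8
from BeautifulSoup import BeautifulSoup

def parseLinks(content):
    # Parse the HTML and walk its anchor tags (BeautifulSoup 3 API)
    soup = BeautifulSoup(content)
    print soup.title                # the page's <title> tag
    for a in soup.findAll('a'):     # every <a> tag in the document
        print a.get('href')         # href attribute, or None if absent

Combined with the fetcher above, this would be called as parseLinks(getHtmlContent(url)).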