Python Crawler Practice: Fetching Web Pages
Python ships with the urllib and urllib2 modules, and the third-party requests library can also be used to fetch web pages. Here we use the easy_install package-management tool to install the requests and BeautifulSoup libraries: at the CMD prompt, switch into easy_install's directory and run easy_install <package name>.
easy_install requests
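The BeautifulSoup library mentioned above installs the same way (assuming easy_install can reach the package index):

easy_install BeautifulSoup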
Once the requests package is installed, we can choose among urllib, urllib2, and requests to fetch web pages:
#! /usr/bin/env python
# coding: utf-8
import urllib
import urllib2
import requests
import sys

url = 'http://www.csdn.net'

# Fetch the page with urllib2.
def urllib2Test():
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    thePage = response.read()

# Fetch the page with requests; status_code, content and headers
# hold the response details.
def requestsTest():
    r = requests.get(url)
    r.status_code
    r.content
    r.headers

# Fetch with urllib2 and report network-level failures.
def urllib2TestEx(url):
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
        content = response.read()
    except urllib2.URLError, e:
        print e.reason

# Fetch with urllib2 and report HTTP errors (4xx/5xx responses).
def urlhttperror(url):
    req = urllib2.Request(url)
    try:
        urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print e.read()

if __name__ == '__main__':
    urllib2Test()
    requestsTest()
    urllib2TestEx(url)
    urlhttperror(url)
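Note that requestsTest above does no error handling. As a rough requests-side counterpart to urllib2TestEx and urlhttperror (a sketch of mine, not from the original post: raise_for_status and RequestException are real requests APIs, while requestsTestEx is a made-up name):

# Sketch: error handling with requests, mirroring the urllib2 handlers above.
def requestsTestEx(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
        return r.content
    except requests.RequestException, e:
        print e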
#! /usr/bin/env python
# coding: utf-8
import requests
from BeautifulSoup import BeautifulSoup
from os.path import dirname, abspath
import sys
import os
import time  # needed by the retry loop in getHtmlContent below

#PREFIX = dirname(abspath(__file__))

## This block works around encoding errors when handling Chinese text
## by forcing the default encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################
defaultWaitTime = 1

# Fetch url and return the raw page content, retrying up to 5 times
# and waiting defaultWaitTime seconds between attempts.
def getHtmlContent(url):
    global defaultWaitTime
    content = None
    retry = 0
    # Pretend to be a desktop Chrome browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'}
    AccessFrequency = defaultWaitTime
    while retry < 5:
        try:
            r = requests.get(url, timeout=10, headers=headers)
            content = r.content
            return content
        except requests.RequestException:
            retry += 1
            time.sleep(AccessFrequency)
    return content
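The loop above waits a fixed defaultWaitTime between attempts. A common refinement, offered here as a suggestion rather than something from the original post, is to back off exponentially so a struggling server gets longer and longer pauses:

# Sketch: the same retry loop with exponential backoff (an assumption, not in
# the original post): waits 1s, 2s, 4s, 8s, 16s between the five attempts.
# (User-Agent header omitted for brevity.)
def getHtmlContentBackoff(url, retries=5):
    wait = defaultWaitTime
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            return r.content
        except requests.RequestException:
            time.sleep(wait)
            wait *= 2  # double the delay after each failure
    return None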
def Test():
    content = getHtmlContent("http://blog.csdn.net/figo829/article/details/18015537")
    # print content

if __name__ == '__main__':
    Test()
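The script imports BeautifulSoup but never calls it. As a minimal sketch of the parsing step it sets up (printLinks is a hypothetical helper name of mine; BeautifulSoup(), findAll and Tag.get are the real BeautifulSoup 3 API):

# Sketch: parse the fetched page with the BeautifulSoup 3 import above.
# printLinks is a hypothetical helper, not part of the original post.
def printLinks(url):
    content = getHtmlContent(url)
    if content is None:
        return
    soup = BeautifulSoup(content)
    print soup.title  # the page's <title> tag
    for a in soup.findAll('a'):
        href = a.get('href')
        if href:
            print href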