python 抓取网页保存文件-Hqman

python 抓取网页保存文件

#!/usr/bin/env python

#-*- coding: utf-8 -*-

import os

from os.path import join, exists

import urllib2

def getRequest(url):
    """Fetch *url* over HTTP and return the raw response body.

    The request is sent with an Internet Explorer 8 ``User-Agent`` header
    and a 20 second timeout.  The body is returned exactly as read from the
    response; no character-set decoding is applied.

    Args:
        url: The address to fetch.

    Returns:
        The response body as returned by ``response.read()``, or ``None``
        when the request failed (the failure is printed, not raised).
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')

    try:
        response = urllib2.urlopen(request, timeout=20)
        try:
            return response.read()
        finally:
            # Release the underlying connection even if read() fails.
            response.close()
    except urllib2.HTTPError as e:
        # Must come before the generic handler: HTTPError is itself an
        # Exception, so listing it second would make this branch unreachable.
        print(e.code)
        return None
    except Exception as e:
        print("erorr %s %s" % (url, e))
        return None

def saveToFile(filepath, filename, content):
    """Write *content* to the file ``filename`` inside directory ``filepath``.

    The file is opened in binary mode (``'wb'``), so an existing file with
    the same name is overwritten.

    Args:
        filepath: Directory in which the file is created.
        filename: Name of the file within ``filepath``.
        content: The data to write.

    Raises:
        TypeError: If ``content`` is ``None``.  The check happens before the
            file is opened so that an existing file is not truncated to
            zero bytes by a write that is going to fail anyway.
    """
    if content is None:
        raise TypeError("content must be a string or buffer, not None")

    with open(join(filepath, filename), 'wb') as f:
        f.write(content)

if __name__ == '__main__':
    # Example run: download one page and store it on disk.
    htmlstr = getRequest('https://www.google.com')

    # getRequest returns None when the download failed (it has already
    # printed the error), so only write the file when there is a body.
    if htmlstr is not None:
        saveToFile('/home/hqman', 'google.html', htmlstr)