Hqman

Python 抓取网页并保存文件

#!/usr/bin/env python

#-*- coding: utf-8 -*-

import os

from os.path import join, exists

import urllib2

def getRequest(url):
    """Fetch *url* over HTTP and return the raw response body.

    The request is sent with an MSIE 8 ``User-Agent`` header and a
    20-second timeout. The body is returned exactly as read from the
    response; no decoding or re-encoding is applied.

    Args:
        url: Address to fetch.

    Returns:
        The response body as returned by ``response.read()``, or ``None``
        if anything went wrong (the failure is printed, not raised).
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')

    # A single handler is enough: urllib2.HTTPError derives from Exception,
    # so a separate outer HTTPError handler could never be reached.
    try:
        response = urllib2.urlopen(request, timeout=20)
        try:
            return response.read()
        finally:
            # Release the underlying connection even when read() fails.
            response.close()
    except Exception as e:
        # Message text (including its spelling) is left untouched so that
        # anything matching on the existing output keeps working.
        print("erorr %s %s" % (url, e))
        return None



def saveToFile(filepath,filename,content):
    """Write *content* to the file *filename* inside directory *filepath*.

    The file is opened in binary write mode, so an existing file at that
    location is overwritten.

    Args:
        filepath: Directory that will contain the file.
        filename: Name of the file within *filepath*.
        content: Data passed to the file object's ``write``.
    """
    target = join(filepath, filename)
    with open(target, 'wb') as out:
        out.write(content)



if __name__ == '__main__':
    # Download the page and store the raw bytes on disk.
    htmlstr = getRequest('http://www.google.com')

    # getRequest returns None when the download fails; skip the save in
    # that case so we neither create an empty file nor crash in write().
    if htmlstr is not None:
        saveToFile('/home/hqman', 'google.html', htmlstr)

上一篇 下一篇

© Hqman | Powered by LOFTER