# coding:utf-8
# The wildcard import stays first so that the explicit imports below take
# precedence if login163 exports a colliding name.
# NOTE(review): re, sys, getpass, urllib and urllib2 are used in this file but
# not imported here; they appear to arrive through this wildcard import --
# confirm against login163.
from login163 import *
from xml.parsers import expat

# The module name is case-sensitive: the package is ``MySQLdb``.
import MySQLdb
class mail163(Login163):
    """Login163 subclass that lists unread mail and tidies the XML reply.

    Relies on ``self.sid`` and ``self.headers`` being available on the
    instance when ``get_unread_mail`` is called.
    """

    def get_unread_mail(self, start, limit):
        """Request up to ``limit`` unread mails beginning at offset ``start``.

        The request body and the reply are both XML.

        Returns:
            The raw response text when it contains the word ``subject``,
            otherwise ``None``.
        """
        # The request document is assembled from three fixed fragments with
        # the paging values spliced in between them.
        xml_head = '<?xml version="1.0"?><object><int name="fid">1</int><boolean name="skipLockedFolders">false</boolean><string name="order">date</string><boolean name="desc">true</boolean><int name="start">'
        xml_middle = '</int><int name="limit">'
        xml_tail = '</int><boolean name="topFirst">false</boolean><object name="filterFlags"><boolean name="read">false</boolean></object><boolean name="returnTotal">true</boolean><boolean name="returnTag">true</boolean></object>'
        form = urllib.urlencode(
            {'var': xml_head + str(start) + xml_middle + str(limit) + xml_tail}
        )

        endpoint = 'http://twebmail.mail.163.com/js5/s?sid=' + self.sid + '&func=mBox:listMessages&deftabclick=t2&deftabclick=undefined&from=toolbar&type=unread&mBoxentry=1'
        request = urllib2.Request(url=endpoint, data=form, headers=self.headers)
        body = urllib2.urlopen(request).read()

        # A reply without any "subject" is treated as "no more unread mail".
        return body if 'subject' in body else None

    def format(self, xml_data):
        """Simplify the mailbox XML and return the rewritten text.

        Three substitutions are applied in order:

        1. the ``ctrls`` and ``flags`` ``<object>`` elements are removed;
        2. each ``from`` string is reduced to the captured ``...@...`` part
           found between a ``;`` and the next ``&`` (presumably the bare
           address inside entity-escaped angle brackets);
        3. the same reduction is applied to each ``to`` string.
        """
        rewrites = (
            (
                re.compile(r'<object name="ctrls">.*?</object>|<object name="flags" />|<object name="flags">.*?</object>', re.S),
                '',
            ),
            (
                re.compile(r'<string name="from">.*?;(.*?@.*?)&.*?</string>'),
                r'<string name="from">\1</string>',
            ),
            (
                re.compile(r'<string name="to">.*?;(.*?@.*?)&.*?</string>'),
                r'<string name="to">\1</string>',
            ),
        )
        for matcher, replacement in rewrites:
            xml_data = matcher.sub(replacement, xml_data)
        return xml_data
class Db_Connect(object):
    """Small wrapper around a single MySQLdb connection.

    The open connection is stored on ``self.conn``.
    """

    def __init__(self, db_host, user, pwd, db_name, charset="utf8", use_unicode=True):
        """Open the connection; exit the process if the server refuses it.

        Args:
            db_host: MySQL server host.
            user: account name used to log in.
            pwd: password for ``user``.
            db_name: schema selected for the connection.
            charset: connection character set.
            use_unicode: passed through to MySQLdb unchanged.

        On ``MySQLdb.OperationalError`` the failure is printed and the
        process exits with status 1.
        """
        try:
            # Pass the credentials and schema explicitly: connecting with the
            # host alone would ignore user, pwd and db_name altogether.
            self.conn = MySQLdb.connect(
                host=db_host,
                user=user,
                passwd=pwd,
                db=db_name,
                charset=charset,
                use_unicode=use_unicode
            )
        except MySQLdb.OperationalError as e:
            print('Connect %s Failed' % db_host)
            print(e.args)
            sys.exit(1)

    def insert(self, sql, params=None):
        """Execute one statement and commit it.

        Args:
            sql: statement text.  When ``params`` is given, use the driver's
                placeholders (``%s`` for MySQLdb) instead of formatting values
                into the string.
            params: optional sequence of values bound by the driver, which
                takes care of quoting and escaping.

        Returns:
            Whatever ``cursor.execute`` returns (the affected-row count with
            MySQLdb), or ``None`` when a ``Warning`` or ``IntegrityError`` was
            caught and printed.
        """
        cursor = self.conn.cursor()
        try:
            # Only hand params to the driver when the caller supplied them so
            # that plain statements containing '%' keep working as before.
            if params is None:
                n = cursor.execute(sql)
            else:
                n = cursor.execute(sql, params)
            # DB-API connections do not autocommit by default; without this
            # the rows can be discarded when the connection is closed.
            self.conn.commit()
            return n
        except MySQLdb.Warning as e:
            print(e.args)
        except MySQLdb.IntegrityError as e:
            print(e.args)
        finally:
            cursor.close()

    def close(self):
        """Close the underlying connection."""
        self.conn.close()


class Mail_Handler(object):
    """expat callbacks that turn mail ``<object>`` elements into table rows.

    Child elements such as ``<string name="id">...</string>`` are collected
    into ``self.mail`` keyed by their ``name`` attribute.  When an
    ``<object>`` closes and all columns in ``FIELDS`` were seen, one row is
    inserted through ``db_conn.insert(sql, params)``.
    """

    # Keys collected from the XML, in the column order of INSERT_SQL.
    FIELDS = ('id', 'from', 'to', 'subject', 'size')
    # One placeholder per column; values are bound by the driver, never
    # formatted into the SQL text, because mail content is untrusted input.
    INSERT_SQL = "insert into mails(id,from_mail,to_mail,subject,size) values(%s,%s,%s,%s,%s)"

    def __init__(self, data, db_conn):
        self.flag = False       # True while text of a named element is being collected
        self.mail = {}          # fields of the mail currently being read
        self.curr_attrib = ''   # ``name`` attribute of the element being read
        self.data = data        # XML document to parse
        self.db_conn = db_conn  # object providing insert(sql, params)

    def start(self, name, attributes):
        """Handle an opening tag: begin a new record or a new field."""
        if name == 'object':
            # A new <object> starts a fresh record; its own text is ignored.
            self.mail = {}
            self.flag = False
            return
        key = attributes.get('name')
        if key is None:
            # Elements without a name attribute carry nothing we store.
            self.flag = False
            return
        self.curr_attrib = key
        # Start from an empty value so an empty element still yields a field
        # and text chunks can be appended in character().
        self.mail[key] = ''
        self.flag = True

    def end(self, name):
        """Handle a closing tag: store the record when an <object> closes."""
        self.flag = False
        if name != 'object':
            return
        # Take the record and clear it so an enclosing <object> that closes
        # later cannot insert the same mail a second time.
        record, self.mail = self.mail, {}
        if any(field not in record for field in self.FIELDS):
            # Not a complete mail record (e.g. a wrapper object): skip it.
            return
        values = [record[field] for field in self.FIELDS]
        values[-1] = int(values[-1])  # the size column is numeric
        self.db_conn.insert(self.INSERT_SQL, tuple(values))

    def character(self, data):
        """Append a chunk of text to the field currently being read."""
        if self.flag:
            # expat may deliver one text node in several calls (for example
            # around entity references), so chunks are concatenated.
            self.mail[self.curr_attrib] = self.mail.get(self.curr_attrib, '') + data

    def parser(self):
        """Parse ``self.data``, inserting one row per complete mail record."""
        p = expat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.character
        # The whole document is supplied at once, so mark it as final to make
        # expat report truncated input instead of silently accepting it.
        p.Parse(self.data, True)
def main(db_host='192.168.110.142', db_user='admin', db_pwd='test', db_name='test'):
    """Copy the unread 163 mails of an interactively entered account to MySQL.

    Prompts for the mail address and password, logs in, then pages through
    the unread list five mails at a time until ``get_unread_mail`` returns
    ``None``.  Each page is cleaned with ``format`` and stored through
    ``Mail_Handler``.

    Args:
        db_host: MySQL server host.
        db_user: MySQL account name.
        db_pwd: MySQL password.
        db_name: schema holding the ``mails`` table.

    NOTE(review): Db_Connect requires four positional arguments (host, user,
    password, schema), but only three literals were supplied here, which
    raises TypeError.  Those three are kept as the defaults of the first
    three parameters; the ``db_name`` default is an assumption -- confirm the
    real schema name and credentials before running.
    """
    db_conn = Db_Connect(db_host, db_user, db_pwd, db_name)
    try:
        username = raw_input('Enter you email:')
        password = getpass.getpass('Enter you password:')
        login = mail163(username, password)
        sid = login.login()  # log in to 163 mail to obtain the sid
        if not sid:
            # Login failed: nothing to fetch.
            return

        start = 0  # offset of the first mail on the current page
        limit = 5  # number of unread mails requested per page
        while True:
            res = login.get_unread_mail(start, limit)
            if res is None:
                break
            res = login.format(res)             # regex clean-up of the XML
            Mail_Handler(res, db_conn).parser()  # expat parse + insert
            start += limit
    finally:
        # Release the connection even when login, fetching or parsing fails.
        db_conn.close()


if __name__ == '__main__':
    main()
这段代码读取163邮箱未读邮件的标题，并将数据插入MySQL数据库，使用了expat进行数据处理。Login163类是“爬虫”这一节当中的类。