xml数据处理--expat模块使用

前端之家收集整理的这篇文章主要介绍了 XML 数据处理——expat 模块的使用。前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

  1. # coding:utf-8
  2. from login163 import *
  3. from xml.parsers import expat
  4. import MysqLdb
  5. class mail163(Login163):
  6. '''
  7. get 'limit' unread mails at once,the data format is xml
  8. if 'subject' in xml data,then return the data,else return None
  9. '''
  10. def get_unread_mail(self,start,limit):
  11. postdata = {
  12. 'var':'<?xml version="1.0"?><object><int name="fid">1</int><boolean name="skipLockedFolders">false</boolean><string name="order">date</string><boolean name="desc">true</boolean><int name="start">'+str(start)+'</int><int name="limit">'+str(limit)+'</int><boolean name="topFirst">false</boolean><object name="filterFlags"><boolean name="read">false</boolean></object><boolean name="returnTotal">true</boolean><boolean name="returnTag">true</boolean></object>'
  13. }
  14. postdata = urllib.urlencode(postdata)
  15. url = 'http://twebmail.mail.163.com/js5/s?sid='+self.sid+'&func=mBox:listMessages&deftabclick=t2&deftabclick=undefined&from=toolbar&type=unread&mBoxentry=1'
  16. req = urllib2.Request(url=url,data=postdata,headers=self.headers)
  17. res = urllib2.urlopen(req).read()
  18. if 'subject' in res:
  19. return res
  20. else:
  21. return None
  22. '''
  23. xml data format,then return the data
  24. '''
  25. def format(self,xml_data):
  26. pattern = re.compile(r'<object name="ctrls">.*?</object>|<object name="flags" />|<object name="flags">.*?</object>',re.S)
  27. xml_data = pattern.sub('',xml_data)
  28. pattern = re.compile(r'<string name="from">.*?;(.*?@.*?)&.*?</string>')
  29. xml_data = pattern.sub(r'<string name="from">\1</string>',xml_data)
  30. pattern = re.compile(r'<string name="to">.*?;(.*?@.*?)&.*?</string>')
  31. xml_data = pattern.sub(r'<string name="to">\1</string>',xml_data)
  32. return xml_data
  33. #db connect
  34. class Db_Connect(object):
  35. def __init__(self,db_host,user,pwd,db_name,charset="utf8",use_unicode = True):
  36. try:
  37. self.conn = MysqLdb.Connection(db_host,charset=charset,use_unicode=use_unicode)
  38. except MysqLdb.OperationalError,e:
  39. print 'Connect %s Failed' % db_host
  40. print e.args
  41. sys.exit(1)
  42. def insert(self,sql):
  43. try:
  44. n = self.conn.cursor().execute(sql)
  45. return n
  46. except MysqLdb.Warning,e:
  47. print e.args
  48. except MysqLdb.IntegrityError,e:
  49. print e.args
  50. def close(self):
  51. self.conn.close()
  52. class Mail_Handler(object):
  53. def __init__(self,data,db_conn):
  54. self.flag = False # control the data update
  55. self.mail = {} # a mail info
  56. self.curr_attrib = ''
  57. self.data = data # xml data
  58. self.db_conn = db_conn
  59. def start(self,name,attributes):
  60. if name == 'object':
  61. self.mail = {}
  62. # get the value of the attribute
  63. # <string name="id">sdosod0sdfsd</string>
  64. # the value is "id"
  65. values = attributes.values()
  66. if len(values):
  67. self.curr_attrib = values[0]
  68. self.flag = True
  69. def end(self,name):
  70. sql = "insert into mails(id,from_mail,to_mail,subject,size) values('%s','%s',%d)"
  71. fields = ('id','from','to','subject','size')
  72. if name == 'object':
  73. #print self.mail
  74. values = [self.mail[i] for i in fields]
  75. values[-1] = int(values[-1]) # the size type is int
  76. values = tuple(values)
  77. #print values
  78. #print sql % values
  79. self.db_conn.insert(sql % values)
  80. self.flag = False
  81. def character(self,data):
  82. if self.flag:
  83. self.mail[self.curr_attrib] = data
  84. def parser(self):
  85. p = expat.ParserCreate()
  86. p.StartElementHandler = self.start
  87. p.EndElementHandler = self.end
  88. p.CharacterDataHandler = self.character
  89. p.Parse(self.data) # parse xml data
  90. def main():
  91. flag = True
  92. db_conn = Db_Connect('192.168.110.142','admin','test')
  93. username = raw_input('Enter you email:')
  94. password = getpass.getpass('Enter you password:')
  95. login = mail163(username,password)
  96. sid = login.login() # login the 163 mail for getting sid
  97. # login success
  98. if sid:
  99. start = 0 # the start page
  100. limit = 5 # read 5 unread mails at once
  101. while flag:
  102. res = login.get_unread_mail(start,limit)
  103. if res is None:
  104. flag = False
  105. else:
  106. res = login.format(res) # use re module format data
  107. parser = Mail_Handler(res,db_conn) # use expat parse xml
  108. parser.parser()
  109. start += limit
  110. db_conn.close()
  111. if __name__ == '__main__':
  112. main()




这段代码读取163邮箱未读邮件标题并将数据插入MySQL数据库,使用了expat进行数据处理。Login163类是爬虫这一节当中的类。

猜你在找的XML相关文章