聚合数据 (Juhe) API Crawler: Python

  from db import run_sql          # imported by the original project; unused in this listing
  from config import start_urls   # start_urls lives in config.py (shown below)
  import urllib2
  from bs4 import BeautifulSoup
  import sys

  # Force UTF-8 as the process-wide default encoding (Python 2 only).
  reload(sys)
  sys.setdefaultencoding('utf-8')

  # Redirect all print output into result2.txt.
  f = open('result2.txt', 'w+')
  sys.stdout = f

  def get_type_list(url):
      """Scrape the docs index and return [name, url] pairs for each API category."""
      page = urllib2.urlopen(url)
      soup = BeautifulSoup(page.read(), 'html.parser')
      cache = soup.find_all('dl')[0]
      cache = cache.find_all('dd')[1:-1]  # skip the first and last entries
      result = []
      for item in cache:
          item.span.extract()  # drop the count <span> so item.a.string is clean
          # url[:-5] strips the trailing '/docs' to recover the site root
          result.append([item.a.string, url[:-5] + item.a['href']])
      return result

  def get_API_list(url):
      """Return [name, url] pairs for every API on a category page, following pagination."""
      page = urllib2.urlopen(url)
      soup = BeautifulSoup(page.read(), 'html.parser')
      cache = soup.find_all('div', class_='juheapis_desc clearfix')
      result = []
      for item in cache:
          item.img.extract()       # drop the thumbnail
          r = item.select('a')[1]  # the second link carries the API name and detail URL
          # url[:url.find('/', 8)] keeps just scheme + host
          result.append([r.string, url[:url.find('/', 8)] + r['href']])
      # Follow the "next page" link recursively, if there is one.
      cache = soup.select('.juheapi_next')
      if cache and cache[0].has_attr('href'):
          u = url[:url.find('/', 8)] + cache[0]['href']
          result.extend(get_API_list(u))
      return result

  def get_API_list_children(url):
      """Return [name, url] pairs for the sub-interfaces in an API's sidebar."""
      page = urllib2.urlopen(url)
      soup = BeautifulSoup(page.read(), 'html.parser')
      cache = soup.select('.das_left a')
      result = []
      for item in cache:
          # Strip the leading 'N.' numbering from the link text.
          result.append([item.string[item.string.index('.') + 1:],
                         url[:url.find('/', 8)] + item['href']])
      return result

  def get_API_info(url):
      """Return the first four label/value detail lines of an interface page."""
      page = urllib2.urlopen(url)
      soup = BeautifulSoup(page.read(), 'html.parser')
      cache = soup.select('.simpleline')[:4]
      result = []
      for item in cache:
          # <strong> holds the field label (trailing colon removed), <span> the value.
          result.append([item.strong.string[:-1], item.span.string])
      return result

  if __name__ == '__main__':
      # Walk the hierarchy: category -> API -> sub-interface -> detail fields.
      for item in get_type_list(start_urls[0]):
          print item[0]
          for item2 in get_API_list(item[1]):
              print '----', item2[0], item2[1]
              for item3 in get_API_list_children(item2[1]):
                  print '---- ----', item3[0], item3[1]
                  for item4 in get_API_info(item3[1]):
                      print '---- ---- ----', item4[0], item4[1]
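
The listing above is Python 2 code (urllib2, print statements, reload(sys)). For reference, here is a minimal Python 3 sketch of the first helper, assuming the page markup at juhe.cn/docs is unchanged; only the standard-library import and print syntax differ:

  # Python 3 port of get_type_list -- a sketch, assuming the same page markup
  from urllib.request import urlopen
  from bs4 import BeautifulSoup

  def get_type_list(url):
      soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
      cache = soup.find_all('dl')[0].find_all('dd')[1:-1]
      result = []
      for item in cache:
          item.span.extract()  # same cleanup as the Python 2 version
          result.append([item.a.string, url[:-5] + item.a['href']])
      return result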

Configuration file (config.py):

  start_urls = ['https://www.juhe.cn/docs']
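
The script also does `from db import run_sql` at the top. run_sql is never called in this listing, but the import fails if the module is missing; a hypothetical one-line stub (db.py) keeps the script runnable without a database:

  # db.py -- hypothetical stub: run_sql is imported by the crawler but never called here
  def run_sql(sql, params=None):
      raise NotImplementedError('database persistence is not wired up in this crawler')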

Usage notes:
1. The script crawls the 聚合数据 (Juhe) API catalog automatically and can be customized to suit your needs.
2. It relies on the BeautifulSoup module; install BeautifulSoup4 before running:

  pip install beautifulsoup4
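
A quick sanity check that the install works, using the same parser the crawler relies on (a throwaway snippet, independent of the target site):

  from bs4 import BeautifulSoup
  soup = BeautifulSoup('<dl><dd><a href="/x">demo</a></dd></dl>', 'html.parser')
  print soup.dd.a.string  # prints: demo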
