Python scraping with BeautifulSoup

Can anyone help me? I have spent some time trying to run this code, but it gives me blank output. I tried using other regular expressions, but to no avail. Your help would be greatly appreciated.

Note: the code is supposed to search Google, store all of the result pages, scrape the related email addresses and phone numbers, and write the scraped data out.

from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib2 import urlopen,Request,HTTPError
import urlparse # urlparse is its own module in Python 2, not part of urllib2
import urllib2
import re
import numpy as np
import csv
from httplib import BadStatusLine
import ssl
import json
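
# NOTE (editor's comment): this script is Python 2 code -- urllib2, httplib and
# raw_input do not exist in Python 3 (there they are urllib.request,
# http.client and input).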

class Google:
    @classmethod
    def search1(self,search):
      url_list = []   #store all the extracted urls in a List
      title_list = [] #store all the extracted titles in a List
      description_list = []  #store all the extracted Description in a List
      print url_list

      for start in range(0,10):
        page = requests.get('http://www.google.com/search?q='+search+'&start='+str(start*10),verify = False) # '&start=' was missing, so the page offset was glued onto the query string
        soup = BeautifulSoup(page.content,'html.parser') # name the parser explicitly to avoid bs4's "no parser specified" warning
        for cite in soup.findAll('cite'): #extract all URLs
            url = cite.text
            print url
            if not urlparse.urlparse(url).scheme: #check if url has prefix http:// or not
                url = 'http://'+url
                print url
            url_list.append(url.replace('https://','http://'))

        for tit in soup.findAll('h3',attrs={'class':'r'}): #extract all Titles
            print tit.text
            title_list.append(tit.text)

        for descr in soup.findAll('span',attrs={'class':'st'}): #extract all descriptions
            print descr.text
            description_list.append(descr.text)

      record_list = [list(item) for item in list(zip(url_list,title_list,description_list))] #join all the lists
      df = pd.DataFrame(record_list,columns=['URL','Title','Description'])
      df.to_csv('result_url_topic_desc.csv',index=False,encoding='utf-8')
      with open('result_url_topic_desc.csv') as f:
           reader = csv.DictReader(f)
           rows = list(reader)
      with open('result_url_topic_desc_JSON.json','w') as f:
           json.dump(rows,f,sort_keys=False,indent=4,separators=(',',': '),encoding='utf-8') 


user_input = raw_input("Enter your search string : ")
Google.search1(user_input) # user search string
#Google.search1('cloud managed services') # user search string,it could be anything the user types
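
# Editor's note: if result_url_topic_desc.csv comes out empty at this point, the
# selectors in search1 matched nothing; print page.status_code inside search1 to
# check whether Google returned a normal results page at all.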

df2=pd.DataFrame()
df2 = pd.read_csv('result_url_topic_desc.csv',encoding='utf-8')
phn_1 = []    #store all the extracted Phn numbers in a List
mail_1 = []    #store all the extracted E-mail in a List
for row in df2.iterrows():  # Parse through each url in the list.
    try:
        try:
           req1 = Request(row[1]['URL'],headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
           gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) # Bypass SSL certification verification
           f = urlopen(req1,context=gcontext)
           url_name = f.geturl() #extract URL name 
           s = f.read()
           phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})",s)  # Phone regex
           emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}",s)  #Email regex
           #emails = re.findall(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$",s)
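           # Editor's note: the {2,3} cap on the top-level domain misses longer
           # TLDs such as .info or .online; [A-Za-z]{2,} is a safer upper bound.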

           if len(phone) == 0:
              print("No phone number found.")
              err_msg_phn = "No phone number found."
              phn_1.append((url_name,err_msg_phn))

           else:
               count = 1
               for item in phone:
                   phn_1.append((url_name,item))
                   count += 1
               print(phn_1)

           if len(emails) == 0:
              print("No email address found.")
              err_msg_mail = "No email address found."
              mail_1.append((url_name,err_msg_mail))

           else:
               count = 1
               for item in emails:
                   mail_1.append((url_name,item))
                   count += 1
               print(mail_1)

        except BadStatusLine: # Catch malformed HTTP responses
            print("could not fetch %s" % row[1]['URL']) # url_name may be unset if urlopen itself failed

    except urllib2.HTTPError as err: # catch HTTP errors such as 404 Not Found
        if err.code == 404: # err is the exception object; compare its .code attribute
            print("Received HTTPError on %s" % row[1]['URL'])


df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()

df_p = pd.DataFrame(phn_1,columns=['URL','Phone_No']) # Dataframe for url and Phn number
df_phn = df_p.drop_duplicates(subset=['URL','Phone_No'],keep='first') #remove duplicates

df_m = pd.DataFrame(mail_1,columns=['URL','Email']) # Dataframe for url and Email
df_mail = df_m.drop_duplicates(subset=['URL','Email'],keep='first') #remove duplicates

df_final = pd.merge(df_phn,df_mail,on = 'URL',how = 'inner') #Merge two dataframes on the common column
#df_final.groupby(['URL'],as_index=False)
df_final.to_csv('result_contact.csv',encoding='utf-8')

#convert the csv output to json
with open('result_contact.csv') as f:
     reader = csv.DictReader(f)
     rows = list(reader)
with open('result_contact_JSON.json','w') as f: 
   json.dump(rows,f,sort_keys=False,indent=4,separators=(',',': '),encoding='utf-8') # the file object 'f' was missing, so json.dump had nothing to write to
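
The most likely cause of the blank output is not the regexes at all: Google often serves a CAPTCHA, a consent page, or different markup to scripted requests, in which case the cite / h3.r / span.st selectors silently match nothing. Below is a minimal check (an editor's sketch, not part of the original code) that fetches one results page with a browser User-Agent and shows what Google actually returned before any parsing; it runs on both Python 2 and 3 and assumes only requests and bs4.

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/55.0.2883.75 Safari/537.36'}
page = requests.get('http://www.google.com/search',
                    params={'q': 'cloud managed services', 'start': 0},
                    headers=headers)
print(page.status_code)           # anything other than 200 means Google refused the query
soup = BeautifulSoup(page.content, 'html.parser')
print(len(soup.findAll('cite')))  # 0 means the results markup no longer contains <cite> tags

If the status code is not 200, or the second print shows 0, the scraper needs updated selectors or an official search API rather than different regular expressions.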