您的代码无法提取链接的原因是:您在逐行遍历表格,而其中某些行没有带 href 属性的锚标记(<a>),因此取值时会失败。我添加了一个 if 条件来检查这一点。现在再试一次。
import requests
from bs4 import BeautifulSoup
# Scrape the airshow calendar table into a list of dicts, one per table row
# whose website cell actually contains a link.
url = 'http://www.milavia.net/airshows/calendar/showdates-2020-world.html'

# The context manager releases the session's pooled connections; the body is
# fully downloaded by the time get() returns, so page.content stays usable.
with requests.Session() as session:
    page = session.get(url, timeout=30)
soup = BeautifulSoup(page.content, 'lxml')

tableOutput = []
# The first <tr> is skipped (presumably the header row -- confirm against
# the page markup).
for row in soup.find_all('tr')[1:]:
    cells = row.find_all('td')[0:7]
    if len(cells) < 7:
        # Separator/short rows cannot be unpacked into the seven columns.
        continue
    date, event, location, website, facebook, feature, notes = cells

    # Some rows have no anchor with an href in the website cell; indexing
    # ['href'] on a missing anchor is what made the unguarded version fail.
    link = website.select_one('a[href]')
    if link is None:
        continue

    tableOutput.append({
        'Date': date.text.strip(),
        'Event': event.text.strip(),
        'Location': location.text.strip(),
        'Site': link['href'],
        'Facebook': facebook.text.strip(),
        'Featuring': feature.text.strip(),
        'Notes': notes.text.strip(),
    })

print(tableOutput)
输出:
[{'Featuring': '','Location': 'Kuwait International Airport,Kuwait','Site': 'http://kuwaitaviationshow.com/','Date': '15-18 Jan','Facebook': '','Event': 'Kuwait Aviation Show','Notes': 'public 17-18'},{'Featuring': '','Location': 'Tauranga,New Zealand','Site': 'http://www.tcas.nz','Date': '18 Jan','Event': 'Classics of the Sky Tauranga City Airshow','Notes': ''},'Location': 'Lucknow,Uttar Pradesh,India','Site': 'https://defexpo.gov.in/','Date': '05-08 Feb','Event': 'Defexpo India 2020','Notes': 'public Sat. 8th'},'Location': 'Changi Exhibition Centre,Singapore','Site': 'http://www.singaporeairshow.com/','Date': '11-16 Feb','Event': 'Singapore Airshow 2020','Notes': 'public Sat-SunReports: 2018 2014'},'Location': 'Al Bateen Executive Airport,Abu Dhabi,United Arab Emirates','Site': 'http://www.adairexpo.com/','Date': '04-06 Mar','Event': 'Abu Dhabi Air Expo & Heli Expo 2020','Notes': 'trade expo'},'Location': "Djerba–Zarzis Int'l Airport,Djerba,Tunisia",'Site': 'http://www.iadetunisia.com/en/','Date': '04-08 Mar','Event': 'IADE Tunisia 2020','Notes': 'public days 7-8'},'Location': 'Tyabb Airport,Tyabb VIC,Australia','Site': 'http://www.tyabbairshow.com/','Date': '08 Mar','Event': 'Tyabb Air Show 2020','Location': 'Echuca Airport,Echuca VIC,'Site': 'http://www.antique-aeroplane.com.au/','Date': '20-22 Mar','Event': 'AAAA National Fly-in','Location': "Santiago Int'l Airport,Santiago,Chile",'Site': 'http://www.fidae.cl/','Date': '31 Mar / 05 Apr','Event': 'FIDAE 2020','Notes': 'public Apr 4-5'},'Location': 'Wanaka Airport,Otago,'Site': 'http://www.warbirdsoverwanaka.com/','Date': '11-13 Apr','Event': 'Warbirds Over Wanaka 2020','Notes': 'Report 2010'},'Location': 'Illawarra Regional Airport,Wollongong NSW,'Site': 'http://www.woi.org.au/','Date': '02-03 May','Event': 'Wings over Illawarra','Location': 'AFB Waterkloof,Centurion,South Africa','Site': 'http://www.aadexpo.co.za/','Date': '16-20 Sep','Event': 'Africa Aerospace & Defence - AAD 2020','Notes': 'public 
19-20'},'Location': 'JIExpo Kemayoran,Jakarta,Indonesia','Site': 'http://www.indoaerospace.com/','Date': '04-07 Nov','Event': 'Indo Aerospace 2020','Notes': 'trade only'},'Location': 'Zhuhai,Guangdong,China','Site': 'http://www.airshow.com.cn/','Date': '10-15 Nov','Event': 'Airshow China 2020','Notes': 'public 13-15th'},'Location': 'Sakhir Air Base,Bahrain','Site': 'http://www.bahraininternationalairshow.com/','Date': '18-20 Nov','Event': 'Bahrain International Airshow BIAS 2020','Notes': ''}]
,
使用 CSS 选择器进行过滤。在 bs4 4.7.1 及以上版本中,可以使用 :has 伪类选择器,确保只处理包含这些链接的行。这样既减少了代码行数,也不再需要切片索引。如果您使用 select,还可以利用它的 limit 参数。
import requests
from bs4 import BeautifulSoup
# Scrape the airshow calendar table into a list of dicts, using a CSS :has()
# selector so that only rows containing a link are visited at all.
url = 'http://www.milavia.net/airshows/calendar/showdates-2020-world.html'
page = requests.get(url, timeout=30)
soup = BeautifulSoup(page.content, 'lxml')

tableOutput = []
# 'tr:has(.asclnk[href])' keeps only rows with a descendant of class
# "asclnk" that carries an href, so no header-skipping slice is needed.
for row in soup.select('tr:has(.asclnk[href])'):
    # limit=7 stops the search after the first seven <td> cells.
    cells = row.select('td', limit=7)
    if len(cells) < 7:
        # A short row cannot be unpacked into the seven columns.
        continue
    date, event, location, website, facebook, feature, notes = cells

    # Guaranteed non-None by the :has() filter above.
    # NOTE(review): assumes .asclnk marks the event-website link -- confirm
    # against the page markup.
    link = row.select_one('.asclnk[href]')

    tableOutput.append({
        'Date': date.text.strip(),
        'Event': event.text.strip(),
        'Location': location.text.strip(),
        'Site': link['href'],
        'Facebook': facebook.text.strip(),
        'Featuring': feature.text.strip(),
        'Notes': notes.text.strip(),
    })

print(tableOutput)