运行此抓取工具时,由于内存错误,它会在随机时间继续失败(请参见下面的错误消息)。由于内存保持不变,因此使用objgraph
打印内存使用情况没有任何线索。我的直觉告诉我,这个问题来自重复的Pandas.to_sql()
语句,因为我可以做的最后打印是在to_sql
调用(See this GitHub issue)之前。
错误消息:`/home/user/Projects/scrape_env/bin/python' 中的错误:free(): 下一个大小无效(快速):0x0000000002ae7340 已中止
# coding: utf8
import pandas as pd
import time
import datetime
import traceback
from bs4 import BeautifulSoup
import requests
import random
from utils import get_engine
def requesturl(url):
    """Fetch *url* with throttling and return it parsed as BeautifulSoup.

    Sleeps 1-5s before every request to avoid hammering the site. When
    Amazon's ban page is detected, sleeps 10-30s and retries recursively,
    returning the retry's result (the original discarded it and parsed the
    ban page instead). Returns None when the HTTP request itself fails
    (the original crashed with AttributeError on `''.text`).
    """
    time.sleep(random.randint(1, 5))
    try:
        resp = requests.get(url)
    except requests.RequestException:
        traceback.print_exc()
        return None  # caller must handle a failed fetch
    if 'please refer to the MWS Subscription API' in resp.text:
        print(resp.text)
        print('Banned!! Sleeping... 30s', url)
        time.sleep(random.randint(10, 30))
        return requesturl(url)  # propagate the retried result
    return BeautifulSoup(resp.text, "lxml")
def filter_already_scraped(url_list):
    """Drop URLs already present in the temp table and return the resume point.

    Args:
        url_list: candidate SERP page URLs.

    Returns:
        (remaining_urls, position) where position is the highest stored
        Position (0 when the table is empty or missing).
    """
    engine = get_engine()
    try:
        already_scraped = pd.read_sql(
            'select PageURL from AmazonKindleUnlimitedScrape_temp', engine)
    except Exception:  # table does not exist yet on the first run
        already_scraped = pd.DataFrame(columns=['PageURL'])
    try:
        position = pd.read_sql(
            "select max(Position) from AmazonKindleUnlimitedScrape_temp",
            engine).values[0][0]
    except Exception:  # bug fix: original left this query unguarded,
        position = None  # crashing on first run when the table is absent
    if not position:
        position = 0
    # Set membership is O(1); the original scanned a list per URL (O(n^2)).
    scraped = set(already_scraped['PageURL'])
    url_list = [url for url in url_list if url not in scraped]
    return url_list, position
def get_max_pages(url):
    """Return the last pagination page number shown on the results page."""
    soup = requesturl(url)
    last_page_span = soup.find('span', {'class': 'pagnDisabled'})
    return int(last_page_span.text.strip())
def create_urls(start_url):
    """Build one URL per results page by appending a &page=N parameter."""
    page_count = get_max_pages(start_url)
    return [start_url + "&page=" + str(n) for n in range(1, page_count + 1)]
def scrape_product_info(serp_element, position, url):
    """Extract one product record from a single 's-result-item' element.

    Args:
        serp_element: BeautifulSoup tag for one search-result <li>.
        position: rank of the previous product; incremented here to give
            this product's 1-based rank.
        url: the SERP page URL the element was scraped from.

    Returns:
        dict with ASIN, Title, Author, Price, Position, ProductURL,
        PageURL, isKindleUnlimited, isPrime, timestamp and Date keys.

    NOTE(review): the call site in this file passes only two arguments
    (serp_element, url) — verify that `position` is supplied by the caller.
    """
    position += 1
    product_info = dict.fromkeys(['ASIN','Title','Author'])
    # Canonical product link: strip the '/ref=' tracking suffix, then
    # rebuild the URL from the extracted ASIN.
    product_info['ProductURL'] = \
        serp_element.find('a',{'class': 'a-link-normal a-text-normal'})['href'].split('/ref=')[0]
    product_info['ASIN'] = product_info['ProductURL'].split('/dp/')[1].split('/')[0]
    product_info['ProductURL'] = 'https://www.amazon.co.uk/dp/' + product_info['ASIN']
    product_info['PageURL'] = url
    product_info['Title'] = serp_element.find('h2',{
        'class': 'a-size-medium s-inline s-access-title a-text-normal'}).text.strip()
    product_info['Author'] = serp_element.find_all('a',{'class': 'a-link-normal a-text-normal'})[1].text.split('|')[0]
    # If the second link held a price (contains '£') rather than the author,
    # fall back to the secondary-text span for the author name.
    if '£' in product_info['Author']:
        product_info['Author'] = \
            serp_element.find_all('span',{'class': 'a-size-small a-color-secondary'})[3].text.split('|')[0]
    product_info['Price'] = serp_element.find_all('span',{'class': 'a-size-base a-color-price s-price a-text-bold'})[
        1].text.strip()
    product_info['Position'] = position
    # Presence of the KU badge icon marks Kindle Unlimited availability.
    if serp_element.find('span',{'class': 's-icon s-icon-kindle-unlimited'}):
        product_info['isKindleUnlimited'] = True
    else:
        product_info['isKindleUnlimited'] = False
    # assumes the start URL already filters to Prime titles — TODO confirm
    product_info['isPrime'] = True
    product_info['timestamp'] = datetime.datetime.now()
    product_info['Date'] = product_info['timestamp'].date()
    return product_info
def scrape_serp(urls, position):
    """Scrape every SERP in *urls*, appending each page's rows to the DB.

    Args:
        urls: search-result page URLs still to be scraped.
        position: highest Position already stored; ranking continues from it.
    """
    for url in urls:
        resp = requesturl(url)
        main_results_group = resp.find('div', {'id': 'mainResults'})
        serp_group = main_results_group.find_all('li', {'class': 's-result-item'})
        serp_data = []
        for serp_element in serp_group:
            # Bug fix: the original called scrape_product_info(serp_element, url),
            # omitting the required `position` argument (TypeError), and never
            # advanced the rank. Thread the position through each record.
            product_info = scrape_product_info(serp_element, position, url)
            position = product_info['Position']
            serp_data.append(product_info)
        # One bulk insert per page keeps the DB round-trips bounded.
        pd.DataFrame(serp_data).to_sql('AmazonKindleUnlimitedScrape_temp',
                                       get_engine(), if_exists='append',
                                       index=False)
def prime_titles_scrape():
    """Entry point: enumerate all SERP pages, drop finished ones, scrape the rest."""
    start_url = 'https://www.amazon.co.uk/b/ref=pr_br_rw_lp_mn?node=12730785031&storeType=ebooks&pageType=kindle'
    pending_urls, last_position = filter_already_scraped(create_urls(start_url))
    scrape_serp(pending_urls, last_position)
# Guard the entry point so importing this module does not start scraping.
if __name__ == '__main__':
    prime_titles_scrape()
objgraph打印:
Memory usage: 120680 (kb)
dict 29589
function 29464
list 17002
tuple 12175
Tag 7609
weakref 6167
NavigableString 5138
cell 3961
getset_descriptor 3611
builtin_function_or_method 3346
None