Python "Aborted" after repeated Pandas.to_sql() calls

When running this scraper, it keeps failing at random times with a memory error (see the error message below). Printing memory usage with objgraph gives no clues, since memory stays constant. My gut tells me the problem comes from the repeated Pandas.to_sql() calls, because the last print I can get out is right before the to_sql call (see this GitHub issue).


Error message:

Error in `/home/user/Projects/scrape_env/bin/python': free(): invalid next size (fast): 0x0000000002ae7340
Aborted
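My hunch about to_sql() can be tested in isolation with a loop along these lines (a sketch only: it uses a throwaway SQLite engine instead of the project's get_engine(), and the table name stress_test is made up):

import pandas as pd
from sqlalchemy import create_engine

# Hammer to_sql() on its own, detached from requests/BeautifulSoup,
# to see whether the abort still reproduces.
engine = create_engine('sqlite:///to_sql_stress_test.db')
df = pd.DataFrame({'PageURL': ['https://example.com'] * 100,
                   'Position': list(range(100))})

for i in range(10000):
    df.to_sql('stress_test', engine, if_exists='append', index=False)
    if i % 500 == 0:
        print('iteration', i)

The scraper itself: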

# coding: utf8
import pandas as pd
import time
import datetime
import traceback
from bs4 import BeautifulSoup
import requests
import random
from utils import get_engine


def requesturl(url):
    try:
        time.sleep(random.randint(1, 5))
        resp = requests.get(url)
        if 'please refer to the MWS Subscription API' in resp.text:
            print(resp.text)
            print('Banned!! Sleeping... 30s', url)
            time.sleep(random.randint(10, 30))
            return requesturl(url)  # retry after the cooldown and return that result
    except Exception:
        traceback.print_exc()
        return None  # no response to parse

    return BeautifulSoup(resp.text, "lxml")


def filter_already_scraped(url_list):
    try:
        already_scraped = pd.read_sql('select PageURL from AmazonKindleUnlimitedScrape_temp', get_engine())
        position = pd.read_sql("select max(Position) from AmazonKindleUnlimitedScrape_temp", get_engine()).values[0][0]
    except Exception:  # the table does not exist yet on a first run
        already_scraped = pd.DataFrame(columns=['PageURL'])
        position = None

    if pd.isnull(position):  # max() over an empty table yields NULL/NaN
        position = 0

    url_list = [url for url in url_list if url not in already_scraped['PageURL'].tolist()]
    return url_list, position


def get_max_pages(url):
    resp = requesturl(url)
    # the disabled pagination element shows the highest page number
    return int(resp.find('span', {'class': 'pagnDisabled'}).text.strip())


def create_urls(start_url):
    max_pages = get_max_pages(start_url)
    url_list = []
    for page_num in range(1, max_pages + 1):
        url_list.append(start_url + "&page=" + str(page_num))

    return url_list


def scrape_product_info(serp_element, position, url):
    position += 1
    product_info = dict.fromkeys(['ASIN', 'Title', 'Author'])

    product_info['ProductURL'] = serp_element.find(
        'a', {'class': 'a-link-normal a-text-normal'})['href'].split('/ref=')[0]
    product_info['ASIN'] = product_info['ProductURL'].split('/dp/')[1].split('/')[0]
    product_info['ProductURL'] = 'https://www.amazon.co.uk/dp/' + product_info['ASIN']
    product_info['PageURL'] = url

    product_info['Title'] = serp_element.find(
        'h2', {'class': 'a-size-medium s-inline s-access-title a-text-normal'}).text.strip()

    product_info['Author'] = serp_element.find_all(
        'a', {'class': 'a-link-normal a-text-normal'})[1].text.split('|')[0]
    if '£' in product_info['Author']:  # the second link held a price, so fall back to the byline span
        product_info['Author'] = serp_element.find_all(
            'span', {'class': 'a-size-small a-color-secondary'})[3].text.split('|')[0]

    product_info['Price'] = serp_element.find_all(
        'span', {'class': 'a-size-base a-color-price s-price a-text-bold'})[1].text.strip()

    product_info['Position'] = position
    product_info['isKindleUnlimited'] = bool(
        serp_element.find('span', {'class': 's-icon s-icon-kindle-unlimited'}))
    product_info['isPrime'] = True

    product_info['timestamp'] = datetime.datetime.now()
    product_info['Date'] = product_info['timestamp'].date()

    return product_info


def scrape_serp(urls, position):
    for url in urls:
        resp = requesturl(url)

        main_results_group = resp.find('div', {'id': 'mainResults'})
        serp_group = main_results_group.find_all('li', {'class': 's-result-item'})

        serp_data = []
        for serp_element in serp_group:
            product_info = scrape_product_info(serp_element, position, url)
            position = product_info['Position']  # carry the incremented position forward
            serp_data.append(product_info)

        pd.DataFrame(serp_data).to_sql('AmazonKindleUnlimitedScrape_temp', get_engine(),
                                       if_exists='append', index=False)


def prime_titles_scrape():
    start_url = 'https://www.amazon.co.uk/b/ref=pr_br_rw_lp_mn?node=12730785031&storeType=ebooks&pageType=kindle'
    url_list = create_urls(start_url)
    url_list, position = filter_already_scraped(url_list)
    scrape_serp(url_list, position)


if __name__ == '__main__':
    prime_titles_scrape()
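One thing I am not sure about is that every read_sql()/to_sql() call above builds a fresh engine via get_engine(). A sketch of creating the engine once and reusing it (assuming get_engine() returns a new SQLAlchemy engine on each call; utils.py is not shown, and the helper name write_serp is made up):

import pandas as pd
from utils import get_engine

ENGINE = get_engine()  # one engine (and connection pool) for the whole run

def write_serp(serp_data):
    # same table and semantics as in scrape_serp(), only the engine is reused
    pd.DataFrame(serp_data).to_sql('AmazonKindleUnlimitedScrape_temp', ENGINE,
                                   if_exists='append', index=False)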

objgraph output:

Memory usage: 120680 (kb)
dict                       29589
function                   29464
list                       17002
tuple                      12175
Tag                        7609
weakref                    6167
NavigableString            5138
cell                       3961
getset_descriptor          3611
builtin_function_or_method 3346
None
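For reference, a report in that shape can be produced with something along these lines (a sketch; the exact snippet lives outside the script above, and print_memory_report is an illustrative name, with the trailing None suggesting the call's return value was itself printed):

import resource
import objgraph

def print_memory_report():
    # ru_maxrss is reported in kilobytes on Linux
    print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    # prints the most common object types with their counts and returns None
    objgraph.show_most_common_types(limit=10)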