If you don't need multithreading support (your edits suggest you don't), you can make it work with the following minor changes. proxyVault keeps both the entire proxy pool and the active proxy (the last one) after shuffling the list (your code had both shuffle and choice, but just one of them is enough). pop()-ing from the list changes the active proxy, until there are none left.
import random
import requests
from random import choice
from urllib.parse import urljoin
from bs4 import BeautifulSoup

linklist = [
    'https://www.amazon.com/dp/B00OI0RGGO',
    'https://www.amazon.com/dp/B00TPKOPWA',
    'https://www.amazon.com/dp/B00TH42HWE'
]

proxyVault = ['103.110.37.244:36022', '180.254.218.229:8080', '110.74.197.207:50632', '1.20.101.95:49001', '200.10.193.90:8080', '173.164.26.117:3128', '103.228.118.66:43002', '178.128.231.201:3128', '1.2.169.54:55312', '181.52.85.249:31487', '97.64.135.4:8080', '190.96.214.123:53251', '52.144.107.142:31923', '45.5.224.145:52035', '89.218.22.178:8080', '192.241.143.186:80', '113.53.29.218:38310', '36.78.131.182:39243']
random.shuffle(proxyVault)


class NoMoreProxies(Exception):
    pass


def skip_proxy():
    global proxyVault
    if len(proxyVault) == 0:
        raise NoMoreProxies()
    proxyVault.pop()


def get_proxy():
    global proxyVault
    if len(proxyVault) == 0:
        raise NoMoreProxies()
    proxy_url = proxyVault[-1]
    proxy = {'https': f'http://{proxy_url}'}
    return proxy


def parse_product(link):
    try:
        proxy = get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except Exception:
        # When the request fails, drop the bad (active) proxy and retry with the next one
        skip_proxy()
        return parse_product(link)


if __name__ == '__main__':
    for url in linklist:
        result = parse_product(url)
        print(result)
I would also suggest changing the last try/except clause to catch RequestException instead of Exception.
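For reference, a minimal sketch of that change (only the outer handler differs from the code above; requests.exceptions.RequestException is the base class of requests' network-related errors such as timeouts and connection failures):

from requests.exceptions import RequestException

def parse_product(link):
    try:
        proxy = get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except RequestException:
        # Only network-level failures (timeout, refused connection, dead proxy)
        # rotate the proxy; any other bug now surfaces instead of being swallowed.
        skip_proxy()
        return parse_product(link)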
Perhaps you could put the proxy handling logic inside a class and pass an instance to parse_product(). parse_product() would then call the necessary methods of that instance to get and/or renew the proxy. The class could look like this:
import random
from random import choice


class ProxyHandler:
    proxyVault = [
        "103.110.37.244:36022", "180.254.218.229:8080"  # and so on
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize proxy
        proxy_url = choice(self.proxyVault)
        self.proxy = {"https": f"http://{proxy_url}"}

    def get_proxy(self):
        return self.proxy

    def renew_proxy(self):
        # Remove the current (bad) proxy from the vault
        proxy_pattern = self.proxy.get("https").split("//")[-1]
        if proxy_pattern in self.proxyVault:
            self.proxyVault.remove(proxy_pattern)
        # Set a new proxy
        random.shuffle(self.proxyVault)
        proxy_url = choice(self.proxyVault)
        self.proxy = {"https": f"http://{proxy_url}"}
parse_product() could then look something like this:
def parse_product(link, proxy_handler):
    try:
        if not proxy_handler:
            raise
        proxy = proxy_handler.get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except Exception:
        # When the request fails, renew_proxy() swaps the bad proxy for a fresh one
        proxy_handler.renew_proxy()
        return parse_product(link, proxy_handler)
I think you could pass the same ProxyHandler instance to all of the threads and parallelize.
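As a rough sketch of that idea (hypothetical wiring, reusing the linklist and parse_product() shown earlier; note that ProxyHandler as written is not synchronized, so under heavy contention you may want to guard renew_proxy() with a threading.Lock):

from concurrent.futures import ThreadPoolExecutor
from functools import partial

if __name__ == "__main__":
    handler = ProxyHandler()  # one shared instance for all workers
    worker = partial(parse_product, proxy_handler=handler)
    with ThreadPoolExecutor(max_workers=4) as executor:
        for url, name in zip(linklist, executor.map(worker, linklist)):
            print(url, "->", name)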
I might be missing something crucial here (as it's pretty late), but this seems like an awfully complex solution to a simple problem; it's almost becoming an XY problem. I'm going to post some thoughts, questions (my wanderings), observations, and suggestions:
- The end goal is to access each link (once? or as many times as possible? if the latter, it looks like a DoS attempt, so I'll assume it's the former :) ) using each proxy (when a proxy fails, move to the next one). If that works, fetch some product name (which seems to be some kind of electric motor)
- Why the recursion? It's limited by the stack (in Python, by [Python 3.Docs]: sys.getrecursionlimit()); a tiny demonstration follows after this list
- There's no need to declare a variable global if you don't assign a value to it (there are exceptions, but I don't think this is the case here)
- process_proxy (the question's variant) doesn't behave well when proxyVault becomes empty
- global proxy (the answer's variant) is rather ugly
- Why random instead of simply picking the next proxy from the list?
- parse_product_info (parse_product) behaves inconsistently: in some cases it returns something, in others it doesn't
- Parallelization only happens at the target URL level. It could be improved further if it also worked at the proxy level (but more logic would have to be added to the code)
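To illustrate the recursion point above: every failed proxy adds another stack frame, so a long enough proxy list (or enough repeated failures) will eventually hit CPython's recursion limit. A tiny, self-contained demonstration (not part of the scraper):

import sys

print(sys.getrecursionlimit())  # typically 1000 in CPython

def retry(depth=0):
    # Stand-in for the parse_product -> skip_proxy -> parse_product retry chain
    return retry(depth + 1)

try:
    retry()
except RecursionError as e:
    print("Blew the stack:", e)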
Below is a simplified (much cleaner) version.

code00.py:
#!/usr/bin/env python3

import sys
import random
import requests
from bs4 import BeautifulSoup


urls = [
    "https://www.amazon.com/dp/B00OI0RGGO",
    "https://www.amazon.com/dp/B00TPKOPWA",
    "https://www.amazon.com/dp/B00TH42HWE",
    "https://www.amazon.com/dp/B00TPKNREM",
]

proxies = [
    "103.110.37.244:36022", "180.254.218.229:8080", "110.74.197.207:50632", "1.20.101.95:49001", "200.10.193.90:8080", "173.164.26.117:3128", "103.228.118.66:43002", "178.128.231.201:3128", "1.2.169.54:55312", "181.52.85.249:31487", "97.64.135.4:8080", "190.96.214.123:53251", "52.144.107.142:31923", "45.5.224.145:52035", "89.218.22.178:8080", "192.241.143.186:80", "113.53.29.218:38310", "36.78.131.182:39243"
]


def parse_product_info(link):  # Can also pass proxies as argument
    local_proxies = proxies[:]  # Make own copy of the global proxies (in case you want to shuffle them and not affect other parallel processing workers)
    #random.shuffle(local_proxies)  # Makes no difference, but if you really want to shuffle it, decomment this line
    for proxy in local_proxies:
        try:
            proxy_dict = {"https": f"http://{proxy}"}  # http or https?
            print(f"    Proxy to be used: {proxy_dict['https']}")
            response = requests.get(link, proxies=proxy_dict, timeout=5)
            if not response:
                print(f"    HTTP request returned {response.status_code} code")
                continue  # Move to next proxy
            soup = BeautifulSoup(response.text, "html5lib")
            try:
                product_name = soup.select_one("#productTitle").get_text(strip=True)
                return product_name  # Information retrieved, return it.
            except Exception as e:  # Might want to use specific exceptions
                print(f"ERROR: {e}")
                # URL was accessible, but the info couldn't be parsed.
                # return, as probably it will be the same using any other proxies.
                return None  # Replace by `continue` if you want to try the other proxies
        except Exception as e:
            #print(f"    {e}")
            continue  # Some exception occurred, move to next proxy


def main():
    for url in urls:
        print(f"\nAttempting url: {url}...")
        product_name = parse_product_info(url)
        if product_name:
            print(f"{url} yielded product name:\n[{product_name}]\n")


if __name__ == "__main__":
    print("Python {0:s} {1:d}bit on {2:s}\n".format(" ".join(item.strip() for item in sys.version.split("\n")), 64 if sys.maxsize > 0x100000000 else 32, sys.platform))
    main()
    print("\nDone.")
Output (partial, as I didn't let it go through all the proxies / URLs):
[cfati@CFATI-5510-0:e:\Work\Dev\StackOverflow\q058796837]> "e:\Work\Dev\VEnvs\py_064_03.07.03_test0\Scripts\python.exe" code00.py
Python 3.7.3 (v3.7.3:ef4ec6ed12,Mar 25 2019,22:22:05) [MSC v.1916 64 bit (AMD64)] 64bit on win32
Attempting url: https://www.amazon.com/dp/B00OI0RGGO...
Proxy to be used: http://103.110.37.244:36022
Proxy to be used: http://180.254.218.229:8080
Proxy to be used: http://110.74.197.207:50632
Proxy to be used: http://1.20.101.95:49001
Proxy to be used: http://200.10.193.90:8080
Proxy to be used: http://173.164.26.117:3128
...