If you don't need multithreading support (your edits suggest you don't), you can make it work with the following minor changes. proxyVault keeps both the entire proxy pool and the active proxy (the last one) after shuffling the list (your code had both shuffle and choice, but just one of them is enough). pop()-ing from the list changes the active proxy, until there are none left.
import random
import requests
from random import choice
from urllib.parse import urljoin
from bs4 import BeautifulSoup

linklist = [
    'https://www.amazon.com/dp/B00OI0RGGO',
    'https://www.amazon.com/dp/B00TPKOPWA',
    'https://www.amazon.com/dp/B00TH42HWE'
]

proxyVault = ['103.110.37.244:36022', '180.254.218.229:8080', '110.74.197.207:50632', '1.20.101.95:49001', '200.10.193.90:8080', '173.164.26.117:3128', '103.228.118.66:43002', '178.128.231.201:3128', '1.2.169.54:55312', '181.52.85.249:31487', '97.64.135.4:8080', '190.96.214.123:53251', '52.144.107.142:31923', '45.5.224.145:52035', '89.218.22.178:8080', '192.241.143.186:80', '113.53.29.218:38310', '36.78.131.182:39243']
random.shuffle(proxyVault)


class NoMoreProxies(Exception):
    pass


def skip_proxy():
    global proxyVault
    if len(proxyVault) == 0:
        raise NoMoreProxies()
    proxyVault.pop()


def get_proxy():
    global proxyVault
    if len(proxyVault) == 0:
        raise NoMoreProxies()
    proxy_url = proxyVault[-1]
    proxy = {'https': f'http://{proxy_url}'}
    return proxy


def parse_product(link):
    try:
        proxy = get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except Exception:
        # When the request fails, drop the bad (active) proxy and retry with the next one
        skip_proxy()
        return parse_product(link)


if __name__ == '__main__':
    for url in linklist:
        result = parse_product(url)
        print(result)
I would also suggest changing the last try/except clause to catch RequestException instead of Exception.
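For reference, a minimal sketch of that change (only the outer handler differs from the code above; requests.exceptions.RequestException is the base class of requests' network-related errors such as timeouts and connection failures):

from requests.exceptions import RequestException

def parse_product(link):
    try:
        proxy = get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except RequestException:
        # Only network-level failures (timeout, refused connection, dead proxy)
        # rotate the proxy; any other bug now surfaces instead of being swallowed.
        skip_proxy()
        return parse_product(link)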
Perhaps you could put the proxy handling logic inside a class and pass an instance to parse_product(). parse_product() would then call the necessary methods of that instance to get and/or renew the proxy. The class could look like this:
import random
from random import choice


class ProxyHandler:
    proxyVault = [
        "103.110.37.244:36022", "180.254.218.229:8080"  # and so on
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize proxy
        proxy_url = choice(self.proxyVault)
        self.proxy = {"https": f"http://{proxy_url}"}

    def get_proxy(self):
        return self.proxy

    def renew_proxy(self):
        # Remove the current (bad) proxy from the vault
        proxy_pattern = self.proxy.get("https").split("//")[-1]
        if proxy_pattern in self.proxyVault:
            self.proxyVault.remove(proxy_pattern)
        # Set a new proxy
        random.shuffle(self.proxyVault)
        proxy_url = choice(self.proxyVault)
        self.proxy = {"https": f"http://{proxy_url}"}
parse_product() could then look something like this:
def parse_product(link, proxy_handler):
    try:
        if not proxy_handler:
            raise
        proxy = proxy_handler.get_proxy()
        print("checking the proxy:", proxy)
        res = requests.get(link, proxies=proxy, timeout=5)
        soup = BeautifulSoup(res.text, "html5lib")
        try:
            product_name = soup.select_one("#productTitle").get_text(strip=True)
        except Exception:
            product_name = ""
        return product_name
    except Exception:
        # When the request fails, renew_proxy() swaps the bad proxy for a fresh one
        proxy_handler.renew_proxy()
        return parse_product(link, proxy_handler)
I think you could pass the same ProxyHandler instance to all of the threads and parallelize.
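As a rough sketch of that idea (hypothetical wiring, reusing the linklist and parse_product() shown earlier; note that ProxyHandler as written is not synchronized, so under heavy contention you may want to guard renew_proxy() with a threading.Lock):

from concurrent.futures import ThreadPoolExecutor
from functools import partial

if __name__ == "__main__":
    handler = ProxyHandler()  # one shared instance for all workers
    worker = partial(parse_product, proxy_handler=handler)
    with ThreadPoolExecutor(max_workers=4) as executor:
        for url, name in zip(linklist, executor.map(worker, linklist)):
            print(url, "->", name)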
I might be missing something crucial here (as it's pretty late), but this seems like an awfully complex solution to a simple problem; it's almost becoming an XY problem. I'm going to post some thoughts, questions (my wanderings), observations, and suggestions:
- The end goal is to access each link (once? or as many times as possible? if the latter, it looks like a DoS attempt, so I'll assume it's the former :) ) using each proxy (when a proxy fails, move to the next one). If that works, fetch some product name (which seems to be some kind of electric motor)
- Why the recursion? It's limited by the stack (in Python, by [Python 3.Docs]: sys.getrecursionlimit()); a tiny demonstration follows after this list
- There's no need to declare a variable global if you don't assign a value to it (there are exceptions, but I don't think this is the case here)
- process_proxy (the question's variant) doesn't behave well when proxyVault becomes empty
- global proxy (the answer's variant) is rather ugly
- Why random instead of simply picking the next proxy from the list?
- parse_product_info (parse_product) behaves inconsistently: in some cases it returns something, in others it doesn't
- Parallelization only happens at the target URL level. It could be improved further if it also worked at the proxy level (but more logic would have to be added to the code)
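To illustrate the recursion point above: every failed proxy adds another stack frame, so a long enough proxy list (or enough repeated failures) will eventually hit CPython's recursion limit. A tiny, self-contained demonstration (not part of the scraper):

import sys

print(sys.getrecursionlimit())  # typically 1000 in CPython

def retry(depth=0):
    # Stand-in for the parse_product -> skip_proxy -> parse_product retry chain
    return retry(depth + 1)

try:
    retry()
except RecursionError as e:
    print("Blew the stack:", e)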
Below is a simplified (much cleaner) version.

code00.py:
#!/usr/bin/env python3

import sys
import random
import requests
from bs4 import BeautifulSoup


urls = [
    "https://www.amazon.com/dp/B00OI0RGGO",
    "https://www.amazon.com/dp/B00TPKOPWA",
    "https://www.amazon.com/dp/B00TH42HWE",
    "https://www.amazon.com/dp/B00TPKNREM",
]

proxies = [
    "103.110.37.244:36022", "180.254.218.229:8080", "110.74.197.207:50632", "1.20.101.95:49001", "200.10.193.90:8080", "173.164.26.117:3128", "103.228.118.66:43002", "178.128.231.201:3128", "1.2.169.54:55312", "181.52.85.249:31487", "97.64.135.4:8080", "190.96.214.123:53251", "52.144.107.142:31923", "45.5.224.145:52035", "89.218.22.178:8080", "192.241.143.186:80", "113.53.29.218:38310", "36.78.131.182:39243"
]


def parse_product_info(link):  # Can also pass proxies as argument
    local_proxies = proxies[:]  # Make own copy of the global proxies (in case you want to shuffle them and not affect other parallel processing workers)
    #random.shuffle(local_proxies)  # Makes no difference, but if you really want to shuffle it, decomment this line
    for proxy in local_proxies:
        try:
            proxy_dict = {"https": f"http://{proxy}"}  # http or https?
            print(f"    Proxy to be used: {proxy_dict['https']}")
            response = requests.get(link, proxies=proxy_dict, timeout=5)
            if not response:
                print(f"    HTTP request returned {response.status_code} code")
                continue  # Move to next proxy
            soup = BeautifulSoup(response.text, "html5lib")
            try:
                product_name = soup.select_one("#productTitle").get_text(strip=True)
                return product_name  # Information retrieved, return it.
            except Exception as e:  # Might want to use specific exceptions
                print(f"ERROR: {e}")
                # URL was accessible, but the info couldn't be parsed.
                # return, as probably it will be the same using any other proxies.
                return None  # Replace by `continue` if you want to try the other proxies
        except Exception as e:
            #print(f"    {e}")
            continue  # Some exception occurred, move to next proxy


def main():
    for url in urls:
        print(f"\nAttempting url: {url}...")
        product_name = parse_product_info(url)
        if product_name:
            print(f"{url} yielded product name:\n[{product_name}]\n")


if __name__ == "__main__":
    print("Python {0:s} {1:d}bit on {2:s}\n".format(" ".join(item.strip() for item in sys.version.split("\n")), 64 if sys.maxsize > 0x100000000 else 32, sys.platform))
    main()
    print("\nDone.")
Output (partial, as I didn't let it go through all the proxies / URLs):
[cfati@CFATI-5510-0:e:\Work\Dev\StackOverflow\q058796837]> "e:\Work\Dev\VEnvs\py_064_03.07.03_test0\Scripts\python.exe" code00.py
Python 3.7.3 (v3.7.3:ef4ec6ed12,Mar 25 2019,22:22:05) [MSC v.1916 64 bit (AMD64)] 64bit on win32
Attempting url: https://www.amazon.com/dp/B00OI0RGGO...
Proxy to be used: http://103.110.37.244:36022
Proxy to be used: http://180.254.218.229:8080
Proxy to be used: http://110.74.197.207:50632
Proxy to be used: http://1.20.101.95:49001
Proxy to be used: http://200.10.193.90:8080
Proxy to be used: http://173.164.26.117:3128
...