|
69
|
1
|
|
|
# 1. asynchronously download a list of urls
|
|
|
# 2. retry failed downloads
|
|
|
4
|
|
|
5
|
|
|
6
|
|
|
7 import asyncio
|
|
|
8 from concurrent.futures import ThreadPoolExecutor
|
|
|
9 import random
|
|
|
10 from typing import List
|
|
|
11 import time
|
|
|
12
|
|
|
async def interface_download_url(url: str, retry_number: int = 0):
    """Download *url* via func_download_url, retrying on failure.

    Retries by recursing with an incremented ``retry_number``, capped at
    ALLOWED_RETRY_SIZE. The original accepted ``retry_number`` but never
    checked it, so a persistently failing URL recursed without bound —
    and then overwrote the retry's result with ``res = None`` anyway.

    Args:
        url: the URL to download.
        retry_number: how many attempts have already failed.

    Returns:
        Tuple ``(success, res, url, duration)`` where ``duration`` is the
        elapsed wall-clock time in seconds (the original computed
        ``start - end``, which is always negative) and ``res`` is the
        downloaded payload, or None on failure.
    """
    start = time.time()
    success = False
    res = None
    try:
        res = await func_download_url(url)
        success = True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # and asyncio.CancelledError are not swallowed.
        if retry_number < ALLOWED_RETRY_SIZE:
            # Propagate the retry's own result instead of clobbering it.
            return await interface_download_url(url, retry_number + 1)
    duration = time.time() - start
    return success, res, url, duration
|
|
|
24
|
|
|
async def func_download_url(url: str):
    """Stand-in for a real network fetch: sleep 1-10 s, return nothing."""
    delay = random.randint(1, 10)
    await asyncio.sleep(delay)
|
|
|
27
|
|
|
# Maximum number of URLs downloaded concurrently per batch.
ALLOWED_BATCH_SIZE = 10
# Maximum number of retry passes before giving up on failed URLs.
ALLOWED_RETRY_SIZE = 3
|
|
|
async def download_multi_urls(urls: List[str], retry_num: int = 0):
    """Download *urls* concurrently in batches of ALLOWED_BATCH_SIZE,
    then recursively retry the ones that failed, up to ALLOWED_RETRY_SIZE
    passes.

    The original body did not run: it called ``asyncio.run()`` (invalid
    inside an already-running event loop, and once with no argument),
    gathered a ``tasks`` list that was never built, and contained a
    ``for ... in tasks`` with no colon or body.

    Args:
        urls: URLs to download on this pass.
        retry_num: which retry pass this is (0 on the first call).
    """
    # Stop when out of retries or there is nothing left to download —
    # the original recursed ALLOWED_RETRY_SIZE extra times even when
    # every URL had already succeeded.
    if retry_num > ALLOWED_RETRY_SIZE or not urls:
        return

    # Ceiling division; the original `len(urls) // SIZE + 1` produced an
    # extra empty batch whenever len(urls) was an exact multiple of the
    # batch size.
    number_of_batches = (len(urls) + ALLOWED_BATCH_SIZE - 1) // ALLOWED_BATCH_SIZE

    errors = []
    for batch_num in range(number_of_batches):
        batch = urls[batch_num * ALLOWED_BATCH_SIZE:(batch_num + 1) * ALLOWED_BATCH_SIZE]
        # Schedule the whole batch on the current loop and wait for all
        # of it; create_task + gather replaces the invalid asyncio.run calls.
        tasks = [asyncio.create_task(interface_download_url(u)) for u in batch]
        results = await asyncio.gather(*tasks)
        # interface_download_url returns (success, res, url, duration).
        for success, _res, url, _duration in results:
            if not success:
                errors.append(url)

    await download_multi_urls(errors, retry_num + 1)
|