alpha v1.0.0

This commit is contained in:
Bigsk 2022-08-18 01:54:04 +08:00
parent cb8a9f690a
commit 6c2de65ffe
8 changed files with 228 additions and 1 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

12
.idea/Piuka.iml Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Piuka.iml" filepath="$PROJECT_DIR$/.idea/Piuka.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -1,3 +1,6 @@
# Piuka
Piuka Download Engine with Python
Piuka (Ukrainian: Річка, romanized "Richka", meaning "river") is a download engine written in Python
### Known bugs:
#### 1. When two identical files are downloaded consecutively using disk buffering mode, the second file is incomplete □

180
piuka.py Normal file
View File

@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
import random
import threading, time
import requests
class Piuka(object):
    """
    Multi-threaded download engine built on the "requests" module.

    Creating an instance starts a daemon manager thread which hands the tasks
    queued through add() to at most "thread" worker threads.

    :proxies -> dict | None: Proxies URI, like {"http": "http://localhost"}, see the doc for module "requests".
    :headers -> dict: HTTP headers, like {"referer": "https://www.ghink.net"}, see the doc for module "requests".
                      Keys are stored in lowercase and a default user-agent is added when none is given.
    :thread  -> int : Thread num limit. Default value is 4.
    :timeout -> int | float: Time limit for a single request, in seconds. Default value is 2.
    :flush   -> int : Threshold of data written to disk in Byte(s). Default value is 100MB.

    Public attributes:
    cache  -> dict: Maps URL to the downloaded bytes for tasks whose destination is "::memory::".
    status -> dict: Maps task id to (True, None) on success or (False, error) on failure.
    """

    # Number of attempts made for one HTTP request before the task is failed
    __MAX_ATTEMPTS = 10

    def __init__(self, **kwargs):
        # Consts
        self.__VERSION = ("Alpha", 1, 0, 0)
        # Get proxies
        self.__proxies = kwargs.get("proxies", None)
        assert self.__proxies is None or isinstance(self.__proxies, dict)
        # Get headers
        ua = "Piuka{}/{}.{}.{}".format(*self.__VERSION)
        headers = kwargs.get("headers", {"user-agent": ua})
        assert isinstance(headers, dict)
        # Use lowercase uniformly. A new dict is built because renaming keys
        # while iterating raises RuntimeError, and so that the caller's dict
        # is never modified.
        self.__headers = {key.lower(): value for key, value in headers.items()}
        # Custom UA
        self.__headers.setdefault("user-agent", ua)
        # Get timeout
        self.__timeout = kwargs.get("timeout", 2)
        assert isinstance(self.__timeout, (int, float))
        # Get thread num limit
        self.__thread_num = kwargs.get("thread", 4)
        # Threshold of data written to disk
        self.__flush_size = kwargs.get("flush", (1024 ** 2) * 100)
        # Download queue, entries are (url, dest, task_id, timeout)
        self.__queue = []
        # Cache pool
        self.cache = {}
        # Thread tasks pool, None marks a free slot
        self.__thread = [None for _ in range(self.__thread_num)]
        # Tasks status pool
        self.status = {}
        # Start manager daemon thread
        self.__manager_thread = threading.Thread(target=self.__manager, name="Piuka Thread Manager")
        self.__manager_thread.daemon = True
        self.__manager_thread.start()

    def add(self, urls, dests, timeout=None):
        """
        Add download task
        :urls*   -> str | list | tuple: URL(s) of the resource.
        :dests*  -> str | list | tuple: File's destination(s), need to correspond to the URL(s) in turn.
                                        Use "::memory::" to keep the content in self.cache instead.
        :timeout -> int | float: Time limit for a single request, in seconds.
                                 Defaults to the timeout of the engine.
        :return  -> list: One (url, dest, task_id) tuple per queued task; task_id is the key in self.status.
        """
        assert isinstance(urls, (list, tuple, str))
        assert isinstance(dests, (list, tuple, str))
        if timeout is None:
            timeout = self.__timeout
        assert isinstance(timeout, (int, float))
        # Construct traversable objects. Each argument is wrapped on its own so
        # that a lone string is never iterated character by character.
        if isinstance(urls, str):
            urls = (urls,)
        if isinstance(dests, str):
            dests = (dests,)
        assert len(urls) == len(dests)
        tasks = []
        used_ids = set()
        for url, dest in zip(urls, dests):
            task_id = "{}-{}".format(time.time(), random.randint(1, 1000))
            # Redraw on a clash so two tasks of one batch never share a status entry
            while task_id in used_ids:
                task_id = "{}-{}".format(time.time(), random.randint(1, 1000))
            used_ids.add(task_id)
            tasks.append((url, dest, task_id))
        # The timeout travels with the task inside the queue only, the returned
        # tuples keep their (url, dest, task_id) shape.
        self.__queue.extend(task + (timeout,) for task in tasks)
        return tasks

    def __manager(self):
        """Forever assign queued tasks to free worker slots, polling every 0.1 second."""
        while True:
            for i in range(self.__thread_num):
                if self.__thread[i] is None and self.__queue:
                    url, dest, task_id, timeout = self.__queue.pop(0)
                    worker = threading.Thread(target=self.__worker,
                                              args=(url, dest, task_id, i, timeout),
                                              name="Piuka Worker Thread #{}".format(i + 1))
                    self.__thread[i] = worker
                    worker.start()
            time.sleep(0.1)

    def __worker(self, url, dest, task_id, i, timeout=None):
        """
        Run one download task and record its result in self.status.
        :url*     -> str: URL of the resource.
        :dest*    -> str: Destination path, or "::memory::" to store the content in self.cache[url].
        :task_id* -> str: Key used in self.status.
        :i*       -> int: Index of the slot in the thread pool occupied by this worker.
        :timeout  -> int | float: Time limit for a single request, defaults to the timeout of the engine.
        """
        if timeout is None:
            timeout = self.__timeout
        try:
            if dest == "::memory::":
                response, error = self.__request(requests.get, url, self.__headers, timeout)
                if response is None:
                    self.status[task_id] = (False, error)
                else:
                    self.cache[url] = response.content
                    self.status[task_id] = (True, None)
            else:
                self.status[task_id] = self.__download_to_file(url, dest, timeout)
        except Exception as error:
            # Anything unexpected (unwritable destination, malformed response headers...)
            # is reported through the status pool instead of killing the thread silently.
            self.status[task_id] = (False, error)
        finally:
            # Always release the slot, otherwise the manager could never reuse it
            self.__thread[i] = None

    def __request(self, send, url, headers, timeout):
        """
        Call send (requests.head or requests.get) until a 2xx response is received.
        Gives up after __MAX_ATTEMPTS attempts.
        :return -> tuple: (response, None) on success, (None, error) on failure, where error is
                          the last exception raised or "http failed" for a non-2xx answer.
        """
        # The last error is kept in its own variable: the name bound by
        # "except ... as" is deleted as soon as the except block ends.
        error = None
        for _ in range(self.__MAX_ATTEMPTS):
            try:
                response = send(url, headers=headers, timeout=timeout, proxies=self.__proxies)
            except Exception as exc:
                error = exc
            else:
                if response.status_code // 100 == 2:
                    return response, None
                error = "http failed"
        return None, error

    def __download_to_file(self, url, dest, timeout):
        """
        Download url into the file dest, slice by slice through HTTP Range requests.
        The destination is opened only after the HEAD request succeeded, so an
        unreachable resource does not truncate an existing file.
        :return -> tuple: (True, None) on success or (False, error) on failure.
        """
        file_header, error = self.__request(requests.head, url, self.__headers, timeout)
        if file_header is None:
            return False, error
        raw_length = file_header.headers.get("content-length")
        with open(dest, "wb") as file:
            if raw_length is None:
                # Size unknown, slicing is impossible: fetch everything at once
                file_object, error = self.__request(requests.get, url, self.__headers, timeout)
                if file_object is None:
                    return False, error
                file.write(file_object.content)
                return True, None
            for start, end in self._byte_ranges(int(raw_length), self.__flush_size):
                # Work on a copy: writing "range" into the shared headers would leak
                # it into every later request and truncate the following downloads.
                header = dict(self.__headers)
                header["range"] = "bytes={}-{}".format(start, end)
                file_object, error = self.__request(requests.get, url, header, timeout)
                if file_object is None:
                    return False, error
                file.write(file_object.content)
        return True, None

    @staticmethod
    def _byte_ranges(length, flush_size):
        """
        Split a resource of length bytes into inclusive (start, end) byte ranges.
        The resource is cut into length // flush_size slices of equal size (at least one),
        plus one final shorter slice when the division leaves a remainder.
        An empty resource yields no range at all.
        """
        if length <= 0:
            return []
        slice_count = length // flush_size if flush_size > 0 else 0
        if not slice_count:
            slice_count = 1
        step = length // slice_count
        return [(start, min(start + step, length) - 1) for start in range(0, length, step)]