alpha v1.0.0
This commit is contained in:
parent
cb8a9f690a
commit
6c2de65ffe
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
12
.idea/Piuka.iml
Normal file
12
.idea/Piuka.iml
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
4
.idea/misc.xml
Normal file
4
.idea/misc.xml
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/Piuka.iml" filepath="$PROJECT_DIR$/.idea/Piuka.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
@ -1,3 +1,6 @@
|
||||
# Piuka
|
||||
|
||||
Piuka Download Engine with Python
|
||||
Piuka (Ukrainian: Річка, "Richka", meaning "river") is a download engine written in Python.
|
||||
|
||||
### Known bugs:
|
||||
#### 1. When two identical files are downloaded consecutively in disk-buffering mode, the second file is incomplete □
|
180
piuka.py
Normal file
180
piuka.py
Normal file
@ -0,0 +1,180 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import random
|
||||
import threading, time
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class Piuka(object):
    """
    Create a PiukaEngine object

    Tasks queued with add() are picked up by a background manager thread and
    handed to at most `thread` worker threads. Results are published through
    two public dicts:

    status -> dict: task_id -> (True, None) on success, or (False, error) on
                    failure, where error is the last exception raised or the
                    string "http failed" for a non-2xx response.
    cache  -> dict: url -> bytes, filled for tasks whose destination is
                    "::memory::".

    :proxies -> dict: Proxies URI, like {"http": "http://localhost"}, see the doc for module "requests".
    :headers -> dict: HTTP headers, like {"referer": "https://www.ghink.net"}, see the doc for module "requests".
    :thread  -> int : Thread num limit. Default value is 4.
    :timeout -> int | float : Time limit for a single request, in seconds. Default value is 2.
    :flush   -> int : Threshold of data written to disk in Byte(s). Default value is 100MB.
    """

    # Attempts made for a single HTTP request before the task is reported failed.
    _MAX_ATTEMPTS = 10
    # Destination marker: keep the body in self.cache instead of writing a file.
    _MEMORY_DEST = "::memory::"

    def __init__(self, **kwargs):
        # Consts
        self.__VERSION = ("Alpha", 1, 0, 0)

        # Get proxies
        self.__proxies = kwargs.get("proxies", None)
        assert self.__proxies is None or isinstance(self.__proxies, dict)

        # Get headers
        ua = "Piuka{}/{}.{}.{}".format(*self.__VERSION)
        headers = kwargs.get("headers", {"user-agent": ua})
        assert isinstance(headers, dict)
        # Use lowercase uniformly. A new dict is built instead of renaming keys
        # in place: adding/removing keys while iterating a dict raises
        # RuntimeError, and the caller's dict must not be modified.
        self.__headers = {key.lower(): value for key, value in headers.items()}
        # Custom UA
        self.__headers.setdefault("user-agent", ua)

        # Get timeout
        self.__timeout = kwargs.get("timeout", 2)
        assert isinstance(self.__timeout, (int, float))

        # Get thread num limit
        self.__thread_num = kwargs.get("thread", 4)  # need psutil to read cpu core count

        # Threshold of data written to disk
        self.__flush_size = kwargs.get("flush", (1024 ** 2) * 100)

        # Download queue, entries are (url, dest, task_id, timeout)
        self.__queue = []
        # Cache pool
        self.cache = {}
        # Thread tasks pool, "" marks a free slot
        self.__thread = ["" for _ in range(self.__thread_num)]
        # Tasks status pool
        self.status = {}

        # Start manager daemon thread
        self.__manager_thread = threading.Thread(target=self.__manager, name="Piuka Thread Manager")
        self.__manager_thread.daemon = True
        self.__manager_thread.start()

    def add(self, urls, dests, timeout=None):
        """
        Add download task

        :urls*   -> str | list | tuple: URL(s) of the resource.
        :dests*  -> str | list | tuple: File's destination(s), need to correspond to the URL(s) in turn.
                                        Use "::memory::" to store the body in self.cache[url].
        :timeout -> int | float: Time limit for a single request, in seconds.
                                 Defaults to the engine-wide timeout.

        Returns a list of (url, dest, task_id) tuples; task_id is the key to
        look up in self.status.
        """
        assert isinstance(urls, (list, tuple, str))
        assert isinstance(dests, (list, tuple, str))
        if timeout is None:
            timeout = self.__timeout
        assert isinstance(timeout, (int, float))

        # Construct a traversable object. Each argument is wrapped on its own
        # so that a str paired with a one-element list is not iterated
        # character by character.
        if isinstance(urls, str):
            urls = (urls,)
        if isinstance(dests, str):
            dests = (dests,)
        assert len(urls) == len(dests)

        queue = []
        for url, dest in zip(urls, dests):
            task_id = "{}-{}".format(time.time(), random.randint(1, 1000))
            queue.append((url, dest, task_id))
        # The internal entries also carry the per-task timeout; the returned
        # tuples keep their (url, dest, task_id) shape.
        self.__queue.extend(task + (timeout,) for task in queue)

        return queue

    def __manager(self):
        """Poll forever, starting a worker for each queued task while a slot is free."""
        while True:
            for i in range(self.__thread_num):
                if not self.__thread[i] and self.__queue:
                    url, dest, task_id, timeout = self.__queue.pop(0)
                    worker = threading.Thread(target=self.__worker,
                                              args=(url, dest, task_id, i, timeout),
                                              name="Piuka Worker Thread #{}".format(i + 1))
                    # Occupy the slot before starting so it cannot be handed out twice.
                    self.__thread[i] = worker
                    worker.start()
            time.sleep(0.1)

    @staticmethod
    def _byte_ranges(length, flush_size):
        """
        Split `length` bytes into inclusive (start, end) offsets for Range requests.

        The number of slices is length // flush_size (at least one), so every
        slice holds at least flush_size bytes unless the file itself is smaller.
        `length` must be positive.
        """
        slice_count = max(length // flush_size, 1) if flush_size > 0 else 1
        step = length // slice_count
        # The last valid byte offset is length - 1, hence the clamp before "- 1".
        return [(start, min(start + step, length) - 1) for start in range(0, length, step)]

    def __request(self, method, url, headers, timeout):
        """
        Call `method` (requests.head or requests.get) with retries.

        Returns (response, None) for a 2xx response, otherwise (None, error)
        once _MAX_ATTEMPTS attempts have failed; error is the last exception
        or the string "http failed".
        """
        error = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                response = method(url, headers=headers, timeout=timeout, proxies=self.__proxies)
            except requests.RequestException as exc:
                # The "as" name is unbound when the except block ends, so the
                # exception is kept under another name for later reporting.
                error = exc
                continue
            if response.status_code // 100 == 2:
                return response, None
            error = "http failed"
        return None, error

    def __fetch_to_file(self, url, dest, timeout):
        """Download `url` into the file `dest`; return None on success, else the error."""
        # Ask for the size first so an unreachable URL does not truncate `dest`.
        head, error = self.__request(requests.head, url, self.__headers, timeout)
        if head is None:
            return error

        try:
            length = int(head.headers.get("content-length", 0))
        except ValueError:
            length = 0

        with open(dest, "wb") as file:
            if length <= 0:
                # Size unknown or empty: slicing is impossible, do one plain GET.
                response, error = self.__request(requests.get, url, self.__headers, timeout)
                if response is None:
                    return error
                file.write(response.content)
                return None

            for start, end in self._byte_ranges(length, self.__flush_size):
                # Work on a copy: writing "range" into the shared headers would
                # leak it into every later request, including the next task's
                # HEAD, which would then report a slice length as the file size.
                headers = dict(self.__headers)
                headers["range"] = "bytes={}-{}".format(start, end)
                response, error = self.__request(requests.get, url, headers, timeout)
                if response is None:
                    return error
                if response.status_code != 206:
                    # The server ignored the Range header and sent the whole
                    # body; keep exactly one copy of it and stop slicing.
                    file.seek(0)
                    file.truncate()
                    file.write(response.content)
                    break
                file.write(response.content)
        return None

    def __worker(self, url, dest, task_id, i, timeout=None):
        """
        Run one download task in slot `i` and publish its result in self.status.

        The slot is always released, whatever happens during the download.
        """
        if timeout is None:
            timeout = self.__timeout
        error = None
        try:
            if dest == self._MEMORY_DEST:
                response, error = self.__request(requests.get, url, self.__headers, timeout)
                if response is not None:
                    self.cache[url] = response.content
            else:
                error = self.__fetch_to_file(url, dest, timeout)
        except Exception as exc:
            # Thread boundary: report anything unexpected (unwritable path,
            # malformed response, ...) as a failed task instead of dying silently.
            error = exc
        finally:
            self.status[task_id] = (True, None) if error is None else (False, error)
            self.__thread[i] = ""
|
Reference in New Issue
Block a user