diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/Piuka.iml b/.idea/Piuka.iml new file mode 100644 index 0000000..8b8c395 --- /dev/null +++ b/.idea/Piuka.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d56657a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..16bc563 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 7627ac7..e040808 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ # Piuka -Piuka Download Engine with Python \ No newline at end of file +Piuka(Ukrainian: Річка Richka, Means River) Download Engine with Python + +### Known bugs: +#### 1.When two identical files are downloaded consecutively using disk buffering mode, the second file is incomplete □ \ No newline at end of file diff --git a/piuka.py b/piuka.py new file mode 100644 index 0000000..a6c50a5 --- /dev/null +++ b/piuka.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- +import random +import threading, time + +import requests + + +class Piuka(object): + """ + Create a PiukaEngine object + + :proxies -> dict: Proxies URI, like {"http": "http://localhost"}, see the doc for module "requests". + :headers -> dict: HTTP headers, like {"referer": "https://www.ghink.net"}, see the doc for module "requests". + :thread -> int : Thread num limit. Default value is you cpu core num. + :timeout -> int : Time limit for a single request, in seconds. Default value is 2. + :flush -> int : Threshold of data written to disk in Byte(s). Default value is 100MB. + """ + + def __init__(self, **kwargs): + # Consts + self.__VERSION = ("Alpha", 1, 0, 0) + + # Get proxies + self.__proxies = kwargs.get("proxies", None) + assert type(self.__proxies) == dict or self.__proxies is None + + # Get headers + ua = "Piuka{}/{}.{}.{}".format( + self.__VERSION[0], self.__VERSION[1], self.__VERSION[2], self.__VERSION[3] + ) + self.__headers = kwargs.get("headers", {"user-agent": ua}) + assert type(self.__headers) == dict + # Use lowercase uniformly + for key, value in self.__headers.items(): + self.__headers[key.lower()] = value + if key.lower() != key: + del self.__headers[key] + # Custom UA + if "user-agent" not in self.__headers: + self.__headers["user-agent"] = ua + + # Get timeout + self.__timeout = kwargs.get("timeout", 2) + assert type(self.__timeout) == int + + # Get thread num limit + self.__thread_num = kwargs.get("thread", 4) # need psutil to read cpu core count + + # Threshold of data written to disk + self.__flush_size = kwargs.get("flush", (1024 ** 2) * 100) + + # Download queue + self.__queue = [] + # Cache pool + self.cache = {} + # Thread tasks pool + self.__thread = ["" for _ in range(self.__thread_num)] + # Tasks status pool + self.status = {} + + # Start manager daemon thread + self.__manager_thread = threading.Thread(target=self.__manager, name="Piuka Thread Manager") + self.__manager_thread.daemon = True + self.__manager_thread.start() + + def add(self, urls, dests, timeout=None): + """ + Add download task + + :urls* -> str | list | tuple: URL(s) of the resource. + :dests* -> str | list | tuple: File's destination(s), need to correspond to the URL(s) in turn. + :timeout -> int: Time limit for a single request, in seconds. + """ + assert type(urls) in (list, tuple, str) + assert type(dests) in (list, tuple, str) + if type(urls) in (list, tuple) and type(dests) in (list, tuple): + assert len(urls) == len(dests) + if timeout is None: + timeout = self.__timeout + assert type(timeout) == int + + # Construct a traversable object + if type(urls) == str and type(dests) == str: + urls = (urls,) + dests = (dests,) + + queue = [] + for i in range(len(urls)): + task_id = "{}-{}".format(time.time(), random.randint(1, 1000)) + queue.append((urls[i], dests[i], task_id)) + self.__queue.extend(queue) + + return queue + + def __manager(self): + while True: + for i in range(self.__thread_num): + if not self.__thread[i] and self.__queue: + config = [] + config.extend(self.__queue.pop(0)) + config.append(i) + config = tuple(config) + self.__thread[i] = threading.Thread(target=self.__worker, args=config, + name="Piuka Worker Thread #{}".format(i + 1)) + self.__thread[i].start() + time.sleep(0.1) + + def __worker(self, url, dest, task_id, i): + if dest != "::memory::": + with open(dest, "wb") as file: + error_count = 0 + while True: + try: + if error_count >= 10: + self.status[task_id] = (False, e) + self.__thread[i] = "" + return + else: + file_header = requests.head(url, headers=self.__headers, timeout=self.__timeout) + except Exception as e: + error_count += 1 + else: + if file_header.status_code // 100 == 2: + break + else: + e = "http failed" + error_count += 1 + + # Calc slice size + length = int(file_header.headers["content-length"]) + slice_count = length // self.__flush_size + if not slice_count: + slice_count = 1 + for j in range(0, length, length // slice_count): + header = self.__headers + next_size = j + (length // slice_count) - 1 + if next_size > length: + next_size = length + header["range"] = "bytes={}-{}".format(j, next_size) + error_count = 0 + while True: + try: + if error_count >= 10: + self.status[task_id] = (False, e) + self.__thread[i] = "" + return + else: + file_object = requests.get(url, headers=header, timeout=self.__timeout) + except Exception as e: + error_count += 1 + else: + if file_object.status_code // 100 == 2: + break + else: + e = "http failed" + error_count += 1 + file.write(file_object.content) + self.status[task_id] = (True, None) + self.__thread[i] = "" + elif dest == "::memory::": + error_count = 0 + while True: + try: + if error_count >= 10: + self.status[task_id] = (False, e) + self.__thread[i] = "" + return + else: + file_object = requests.get(url, headers=self.__headers, timeout=self.__timeout) + except Exception as e: + error_count += 1 + else: + if file_object.status_code // 100 == 2: + break + else: + e = "http failed" + error_count += 1 + self.cache[url] = file_object.content + self.status[task_id] = (True, None) + self.__thread[i] = "" \ No newline at end of file