From d386cdcd88f8169fa6e92c985d0f137f69efe659 Mon Sep 17 00:00:00 2001 From: Joakim Holm Date: Thu, 6 Apr 2023 21:38:19 +0200 Subject: [PATCH] Add code --- grawlix/__init__.py | 4 + grawlix/__main__.py | 79 +++++++++++++++ grawlix/arguments.py | 37 +++++++ grawlix/book.py | 59 +++++++++++ grawlix/config.py | 41 ++++++++ grawlix/encryption.py | 38 +++++++ grawlix/exceptions.py | 20 ++++ grawlix/logging.py | 31 ++++++ grawlix/output/__init__.py | 79 +++++++++++++++ grawlix/output/cbz.py | 18 ++++ grawlix/output/epub.py | 5 + grawlix/output/output_format.py | 68 +++++++++++++ grawlix/sources/__init__.py | 38 +++++++ grawlix/sources/flipp.py | 172 ++++++++++++++++++++++++++++++++ grawlix/sources/mangaplus.py | 102 +++++++++++++++++++ grawlix/sources/saxo.py | 119 ++++++++++++++++++++++ grawlix/sources/source.py | 60 +++++++++++ grawlix/sources/webtoons.py | 92 +++++++++++++++++ grawlix/utils/__init__.py | 11 ++ pyproject.toml | 49 +++++++++ setup.py | 3 + shell.nix | 49 +++++++++ 22 files changed, 1174 insertions(+) create mode 100644 grawlix/__init__.py create mode 100644 grawlix/__main__.py create mode 100644 grawlix/arguments.py create mode 100644 grawlix/book.py create mode 100644 grawlix/config.py create mode 100644 grawlix/encryption.py create mode 100644 grawlix/exceptions.py create mode 100644 grawlix/logging.py create mode 100644 grawlix/output/__init__.py create mode 100644 grawlix/output/cbz.py create mode 100644 grawlix/output/epub.py create mode 100644 grawlix/output/output_format.py create mode 100644 grawlix/sources/__init__.py create mode 100644 grawlix/sources/flipp.py create mode 100644 grawlix/sources/mangaplus.py create mode 100644 grawlix/sources/saxo.py create mode 100644 grawlix/sources/source.py create mode 100644 grawlix/sources/webtoons.py create mode 100644 grawlix/utils/__init__.py create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 shell.nix diff --git a/grawlix/__init__.py b/grawlix/__init__.py new file mode 
# ==== grawlix/__init__.py ====
from .encryption import Encryption, AESEncryption
from .book import Book, SingleFile, OnlineFile, Metadata

__version__ = "0.1.0"


# ==== grawlix/__main__.py ====
from .book import Book, Series
from .config import load_config, Config, SourceConfig
from .exceptions import SourceNotAuthenticated, MissingArgument
from .sources import find_source, Source
from .output import download_book
from . import arguments, logging

from typing import Tuple
from rich.progress import Progress
from functools import partial


def get_login(source: Source, config: Config, options) -> Tuple[str, str]:
    """
    Get login credentials for source.

    Values from the config file take precedence over command line options.

    :param source: Source to authenticate
    :param config: Content of config file
    :param options: Command line options
    :returns: Username and password
    :raises MissingArgument: If either username or password is unavailable
    """
    source_name = source.name.lower()
    if source_name in config.sources:
        username = config.sources[source_name].username or options.username
        password = config.sources[source_name].password or options.password
    else:
        username = options.username
        password = options.password
    # Fail early with a grawlix error instead of handing None to source.login
    if username is None or password is None:
        raise MissingArgument
    return username, password


def authenticate(source: Source, config: Config, options) -> None:
    """
    Authenticate with source

    :param source: Source to authenticate
    :param config: Content of config file
    :param options: Command line options
    :raises SourceNotAuthenticated: If the source has no supported login method
    """
    if source.supports_login:
        username, password = get_login(source, config, options)
        source.login(username, password)
    else:
        raise SourceNotAuthenticated


def main() -> None:
    """Command line entry point: download every url given on the command line"""
    args = arguments.parse_arguments()
    config = load_config()
    for url in args.urls:
        source: Source = find_source(url)
        if source.requires_authentication:
            authenticate(source, config, args)
        result = source.download(url)
        if isinstance(result, Book):
            with logging.progress(result.metadata.title, source.name) as progress:
                download_with_progress(result, progress)
        elif isinstance(result, Series):
            with logging.progress(result.title, source.name, len(result.book_ids)) as progress:
                for book_id in result.book_ids:
                    book = source.download_book_from_id(book_id)
                    download_with_progress(book, progress)


def download_with_progress(book: Book, progress: Progress) -> None:
    """
    Download book with progress bar in cli

    :param book: Book to download
    :param progress: Progress object
    """
    task = logging.add_book(progress, book)
    update_function = partial(progress.advance, task)
    download_book(book, update_function)
    # Make sure the task shows as finished even if the format never
    # reported incremental progress
    progress.advance(task, 1)


if __name__ == "__main__":
    main()


# ==== grawlix/arguments.py ====
from grawlix import __version__

import argparse


def parse_arguments() -> argparse.Namespace:
    """
    Parse command line arguments

    :returns: Parsed command line arguments
    """
    # Help
    parser = argparse.ArgumentParser(
        prog = "grawlix",
        description = "Download ebooks"
    )
    parser.add_argument(
        '-v',
        '--version',
        action = "version",
        version = f"grawlix {__version__}"
    )
    # Basics
    parser.add_argument(
        'urls',
        help = "Links to ebooks",
        nargs = "*"
    )
    # Authentication
    parser.add_argument(
        '-u',
        '--username',
        help = "Username for login",
        dest = "username",
    )
    parser.add_argument(
        '-p',
        '--password',
        help = "Password for login",
        dest = "password",
    )
    # Outputs
    return parser.parse_args()


# ==== grawlix/book.py ====
from grawlix import Encryption

from dataclasses import dataclass
from typing import Optional, Union, TypeVar, Generic


@dataclass(slots=True)
class Metadata:
    """Metadata about a book"""
    title: str
    series: Optional[str] = None
    publisher: Optional[str] = None
    identifier: Optional[str] = None


@dataclass(slots=True)
class OnlineFile:
    """Instructions for downloading an online file"""
    url: str
    extension: str
    encryption: Optional[Encryption] = None
    headers: Optional[dict[str, str]] = None


@dataclass(slots=True)
class SingleFile:
    """Bookdata in the form of a single file"""
    file: OnlineFile


@dataclass(slots=True)
class ImageList:
    """
    List of images
    Mostly used for comic books
    """
    images: list[OnlineFile]


BookData = Union[
    SingleFile,
    ImageList
]


@dataclass(slots=True)
class Book:
    """Stores information about a book"""
    metadata: Metadata
    data: BookData


T = TypeVar("T")


@dataclass(slots=True)
class Series(Generic[T]):
    """Stores a series of books; book_ids are source-internal identifiers"""
    title: str
    book_ids: list[T]


# A download result is either a single book or a whole series
Result = Union[
    Book,
    Series[T]
]


# ==== grawlix/config.py ====
from dataclasses import dataclass
from typing import Optional
import tomli
import appdirs
import os


@dataclass(slots=True)
class SourceConfig:
    """Stores configuration for source"""
    username: Optional[str]
    password: Optional[str]


@dataclass(slots=True)
class Config:
    """Grawlix configuration"""
    sources: dict[str, SourceConfig]


def load_config() -> Config:
    """
    Load config from disk

    A missing config file is not an error; an empty configuration is
    returned instead.

    :returns: Config object
    """
    config_dir = appdirs.user_config_dir("grawlix", "jo1gi")
    config_file = os.path.join(config_dir, "grawlix.toml")
    if os.path.exists(config_file):
        with open(config_file, "rb") as f:
            config_dict = tomli.load(f)
    else:
        config_dict = {}
    sources = {}
    if "source" in config_dict:
        for key, values in config_dict["source"].items():
            sources[key] = SourceConfig(
                username = values.get("username"),
                password = values.get("password"),
            )
    return Config(sources)
# ==== grawlix/encryption.py ====
from Crypto.Cipher import AES
from typing import Union
from dataclasses import dataclass


@dataclass(slots=True)
class AESEncryption:
    """AES-CBC decryption parameters"""
    key: bytes
    iv: bytes


@dataclass(slots=True)
class XOrEncryption:
    """Repeating-key xor decryption parameters"""
    key: bytes


Encryption = Union[
    AESEncryption,
    XOrEncryption
]


def decrypt(data: bytes, encryption: Encryption) -> bytes:
    """
    Decrypt data with specified encryption algorithm

    :param data: Bytes to decrypt
    :param encryption: Information about how to decrypt
    :returns: Decrypted data
    :raises NotImplementedError: If the encryption type is unknown
    """
    if isinstance(encryption, AESEncryption):
        cipher = AES.new(encryption.key, AES.MODE_CBC, encryption.iv)
        return cipher.decrypt(data)
    if isinstance(encryption, XOrEncryption):
        key = encryption.key
        key_length = len(key)
        return bytes(
            byte ^ key[index % key_length]
            for index, byte in enumerate(data)
        )
    # `raise NotImplemented` raises the NotImplemented singleton, which is a
    # TypeError at runtime — NotImplementedError is the intended exception
    raise NotImplementedError


# ==== grawlix/exceptions.py ====
class GrawlixError(Exception):
    """Base class for all grawlix errors"""
    pass


class DataNotFound(GrawlixError):
    pass


class InvalidUrl(GrawlixError):
    pass


class UnsupportedOutputFormat(GrawlixError):
    pass


class NoSourceFound(GrawlixError):
    pass


class SourceNotAuthenticated(GrawlixError):
    pass


class MissingArgument(GrawlixError):
    pass


# ==== grawlix/logging.py ====
from grawlix.book import Book

from rich.console import Console
from rich.progress import Progress, BarColumn, ProgressColumn, TaskID, SpinnerColumn
import rich

from typing import Union
from dataclasses import dataclass

# Status output goes to stderr so stdout stays clean for piping
console = Console(stderr=True)


def progress(category_name: str, source_name: str, count=1) -> Progress:
    """
    Print a download header and create a progress bar

    :param category_name: Title of the book or series being downloaded
    :param source_name: Name of the source
    :param count: Number of books to download
    :returns: Progress object (not yet started; use as a context manager)
    """
    if count > 1:
        console.print(f"Downloading [yellow not bold]{count}[/] books in [blue]{category_name}[/] from [magenta]{source_name}[/]")
    else:
        console.print(f"Downloading [blue bold]{category_name}[/] from [magenta]{source_name}[/]")
    progress = Progress(
        SpinnerColumn(),
        "{task.description}",
        BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        console = console
    )
    return progress


def add_book(progress: Progress, book: Book) -> TaskID:
    """
    Add a progress task for a single book

    :param progress: Progress bar to add the task to
    :param book: Book being downloaded
    :returns: Id of the created task
    """
    task = progress.add_task(
        f"[blue]{book.metadata.title}[/]",
        total = 1
    )
    return task


# ==== grawlix/output/__init__.py ====
from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile
from grawlix.exceptions import GrawlixError

from .output_format import OutputFormat
from .epub import Epub
from .cbz import Cbz

from typing import Callable
from pathlib import Path
import os


def download_book(book: Book, update_func: Callable) -> None:
    """
    Download and write book to disk

    :param book: Book to download
    :param update_func: Callback receiving download progress fractions
    """
    output_format = get_default_format(book.data)
    location = format_output_location(book, output_format)
    # exist_ok avoids the check-then-create race of the previous
    # `if not parent.exists(): os.makedirs(parent)` pattern
    os.makedirs(Path(location).parent, exist_ok=True)
    if isinstance(book.data, SingleFile):
        output_format.dl_single_file(book.data, location, update_func)
    elif isinstance(book.data, ImageList):
        output_format.dl_image_list(book.data, location, update_func)
    else:
        raise NotImplementedError


def format_output_location(book: Book, output_format: OutputFormat) -> str:
    """
    Create path to output location of book

    :param book: Book to download
    :param output_format: Output format of book
    :returns: Relative path "<series>/<title>.<ext>"
    """
    series = book.metadata.series or "UNKNOWN"
    return f"{series}/{book.metadata.title}.{output_format.extension}"


def get_default_format(bookdata: BookData) -> OutputFormat:
    """
    Get default output format for bookdata.
    Should only be used if no format was specified by the user

    :param bookdata: Content of book
    :returns: OutputFormat object matching the default
    :raises GrawlixError: If no default exists for the data type
    """
    if isinstance(bookdata, SingleFile):
        return output_format_from_str(bookdata.file.extension)
    if isinstance(bookdata, ImageList):
        return Cbz()
    raise GrawlixError


def output_format_from_str(name: str) -> OutputFormat:
    """
    Convert string to outputformat object

    :param name: Name of output format (file extension)
    :returns: OutputFormat object
    :raises GrawlixError: If no format matches the name
    """
    for output_format in get_output_formats():
        if output_format.extension == name:
            return output_format()
    raise GrawlixError


def get_output_formats() -> list[type[OutputFormat]]:
    """
    Get a list of all available output formats

    :returns: List of available output format classes
    """
    return [
        Cbz,
        Epub,
    ]


# ==== grawlix/output/cbz.py ====
from .output_format import OutputFormat, Update
from grawlix.book import ImageList

import zipfile


class Cbz(OutputFormat):
    """Comic book zip file"""

    extension: str = "cbz"

    def dl_image_list(self, book: ImageList, location: str, update: Update) -> None:
        """
        Download all images and store them in a cbz archive

        :param book: Images to download
        :param location: Path to the output file
        :param update: Optional progress callback
        """
        image_count = len(book.images)
        with zipfile.ZipFile(location, mode="w") as zip:
            for n, file in enumerate(book.images):
                content = self._download_file(file)
                zip.writestr(f"Image {n}.{file.extension}", content)
                if update:
                    update(1/image_count)


# ==== grawlix/output/epub.py ====
from grawlix.book import Book, SingleFile
from .output_format import OutputFormat


class Epub(OutputFormat):
    """Epub ebook; relies entirely on the base class single-file download"""
    extension = "epub"
# ==== grawlix/output/output_format.py ====
from grawlix.book import Book, SingleFile, OnlineFile, ImageList
from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt

import requests
from typing import Callable, Optional

# Progress callback; receives the fraction of the book just downloaded
Update = Optional[Callable[[float], None]]


class OutputFormat:
    """Base class for writing downloaded books to disk"""

    # Extension for output files
    extension: str = ""

    def __init__(self):
        self._session = requests.Session()


    def dl_single_file(self, book: SingleFile, location: str, update_func: Update) -> None:
        """
        Download and write an `grawlix.SingleFile` to disk

        :param book: Book to download
        :param location: Path to where the file is written
        :param update_func: Optional progress callback (unused for single files)
        :raises UnsupportedOutputFormat: If datatype is not supported by format
        """
        if book.file.extension != self.extension:
            raise UnsupportedOutputFormat
        self._download_and_write_file(book.file, location)


    def dl_image_list(self, book: ImageList, location: str, update_func: Update) -> None:
        """
        Download and write an `grawlix.ImageList` to disk

        :param book: Book to download
        :param location: Path to where the file is written
        :param update_func: Optional progress callback
        :raises UnsupportedOutputFormat: Always; subclasses that support
            image lists must override this method
        """
        raise UnsupportedOutputFormat


    def _download_file(self, file: OnlineFile) -> bytes:
        """
        Download `grawlix.OnlineFile`

        :param file: File to download
        :returns: Content of downloaded file
        :raises requests.HTTPError: If the server responds with an error status
        """
        response = self._session.get(
            file.url,
            headers = file.headers
        )
        # Fail loudly instead of silently writing an http error page to disk
        response.raise_for_status()
        content = response.content
        if file.encryption is not None:
            content = decrypt(content, file.encryption)
        return content


    def _download_and_write_file(self, file: OnlineFile, location: str) -> None:
        """
        Download `grawlix.OnlineFile` and write to content to disk

        :param file: File to download
        :param location: Path to where the file is written
        """
        content = self._download_file(file)
        with open(location, "wb") as f:
            f.write(content)
# ==== grawlix/sources/__init__.py ====
from grawlix.exceptions import NoSourceFound

from .source import Source
from .flipp import Flipp
from .mangaplus import MangaPlus
from .saxo import Saxo
from .webtoons import Webtoons

import re


def find_source(url: str) -> Source:
    """
    Find source that matches url

    :param url: Url of book to download
    :returns: Source for downloading url
    :raises NoSourceFound: If no source matches the url
    """
    for cls in get_source_classes():
        # Unused `enumerate` index removed; only the pattern matters here
        for pattern in cls.match:
            if re.match(pattern, url):
                return cls()
    raise NoSourceFound


def get_source_classes() -> list[type[Source]]:
    """
    Get all source types

    :returns: A list of all available source types
    """
    return [
        Flipp,
        MangaPlus,
        Saxo,
        Webtoons
    ]


# ==== grawlix/sources/flipp.py ====
from .source import Source
from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import get_arg_from_url

import re
from urllib.parse import urlparse
from typing import Tuple, Optional

BASEURL = "https://reader.flipp.dk/html5/reader"


class Flipp(Source):
    """Source for downloading magazines from Flipp"""

    name: str = "Flipp"
    match = [
        r"https?://reader.flipp.dk/html5/reader/production/default.aspx\?pubname=&edid=([^/]+)",
        r"https?://magasiner.flipp.dk/flipp/web-app/#/publications/.+"
    ]
    _authentication_methods: list[str] = []
    # Cached response from the signin endpoint (populated on first use)
    _login_cache: Optional[dict] = None

    def download(self, url: str) -> Result:
        """
        Download book or series metadata from url

        :param url: Url of book or series
        :returns: Book or series data
        :raises InvalidUrl: If the url matches no known pattern
        """
        if re.match(self.match[0], url):
            eid = self._get_eid(url)
            publication_id = self._get_series_id(eid)
            return self._download_book(eid, publication_id)
        elif re.match(self.match[1], url):
            return self._download_series(url)
        raise InvalidUrl


    def download_book_from_id(self, book_id: Tuple[str, str]) -> Book:
        """Download a book from a (series_id, issue_id) pair"""
        series_id, issue_id = book_id
        return self._download_book(issue_id, series_id)


    def _download_series(self, url: str) -> Series:
        """
        Download series with book ids from Flipp

        :param url: Url of series
        :returns: Series object
        """
        series_id = url.split("/")[-1]
        login_info = self._download_login_info()
        series_metadata = self._extract_series_data(login_info, series_id)
        issues = []
        for issue in series_metadata["issues"]:
            issue_id = issue["customIssueCode"]
            issues.append((series_id, issue_id))
        return Series(
            title = series_metadata["name"],
            book_ids = issues
        )


    def _download_login_info(self) -> dict:
        """
        Download login info from Flipp
        Will use cache if available

        :returns: Login info
        """
        if self._login_cache:
            return self._login_cache
        login_info = self._session.post(
            "https://flippapi.egmontservice.com/api/signin",
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:111.0) Gecko/20100101 Firefox/111.0"
            },
            json = {
                "email": "",
                "password": "",
                "token": "",
                "languageCulture": "da-DK",
                "appId": "",
                "appVersion": "",
                "uuid": "",
                "os": ""
            }
        ).json()
        # Bug fix: was `self.login_cache`, which never matched the
        # `_login_cache` attribute read above, so the cache never hit and
        # every call re-posted the signin request
        self._login_cache = login_info
        return login_info


    def _extract_series_data(self, response: dict, series_id: str) -> dict:
        """
        Extract metadata about series from login response

        :param response: Login response from Flipp
        :param series_id: Id of series
        :returns: Metadata about series
        :raises DataNotFound: If the series id is not in the response
        """
        for publication in response["publications"]:
            if publication["customPublicationCode"] == series_id:
                return publication
        raise DataNotFound


    def _download_book(self, issue_id: str, series_id: str) -> Book:
        """
        Download book from Flipp

        :param issue_id: Issue identifier
        :param series_id: Series identifier
        :returns: Book metadata
        """
        pages = self._get_pages(issue_id, series_id)
        metadata = self._get_metadata(issue_id, series_id)
        return Book(
            data = ImageList(pages),
            metadata = Metadata(
                title = f"{metadata['series_name']} {metadata['issueName']}",
                series = metadata["series_name"],
                identifier = issue_id
            ),
        )


    def _get_metadata(self, issue_id: str, series_id: str) -> dict:
        """
        Download and extract issue data

        :param issue_id: Issue id
        :param series_id: Series id
        :returns: Issue metadata
        :raises DataNotFound: If the issue id is not in the series
        """
        login_info = self._download_login_info()
        series_metadata = self._extract_series_data(login_info, series_id)
        for issue in series_metadata["issues"]:
            if issue["customIssueCode"] == issue_id:
                issue["series_name"] = series_metadata["name"]
                return issue
        raise DataNotFound


    def _get_eid(self, url: str) -> str:
        """Extract the `edid` query parameter from a reader url"""
        return get_arg_from_url(url, "edid")


    def _get_series_id(self, issue_id: str) -> str:
        """
        Download series id from issue id

        :param issue_id: Issue id
        :returns: Series id
        :raises DataNotFound: If no publication guid is found in the page
        """
        response = self._session.get(f"{BASEURL}/production/default.aspx?pubname=&edid={issue_id}")
        # TODO Make faster
        search = re.search(r'publicationguid = "([^"]+)', response.text)
        if search is None:
            raise DataNotFound
        return search.group(1)


    def _get_pages(self, issue_id: str, series_id: str) -> list[OnlineFile]:
        """
        Download page metadata for book

        :param issue_id: Issue id
        :param series_id: Series id
        :return: Page image links
        """
        response = self._session.get(
            f"{BASEURL}/get_page_groups_from_eid.aspx?pubid={series_id}&eid={issue_id}",
        )
        result = []
        for page in response.json()["pageGroups"]:
            # Find image id in low quality image url
            low_quality_url = urlparse(page["pages"][0]["image"])
            image_id = low_quality_url.path[1:-9]
            high_quality_url = f"http://pages.cdn.pagesuite.com/{image_id}/highpage.jpg?method=true"
            result.append(OnlineFile(high_quality_url, "jpg"))
        return result
# ==== grawlix/sources/mangaplus.py ====
from .source import Source
from grawlix.encryption import XOrEncryption
from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
from grawlix.exceptions import InvalidUrl, DataNotFound

import re
import blackboxprotobuf
import json
import rich


class MangaPlus(Source):
    """Source for downloading manga from Manga Plus"""

    name: str = "Manga Plus"
    match = [
        r"https?://mangaplus.shueisha.co.jp/viewer/\d+",
        r"https?://mangaplus.shueisha.co.jp/titles/\d+"
    ]
    _authentication_methods: list[str] = []


    def download(self, url: str) -> Result:
        """
        Download issue or series metadata from url

        :param url: Url of issue or series
        :returns: Issue or series data
        :raises InvalidUrl: If the url matches no known pattern
        """
        if re.match(self.match[0], url):
            issue_id = url.split('/')[-1]
            return self._download_issue(issue_id)
        if re.match(self.match[1], url):
            series_id = url.split("/")[-1]
            return self._download_series(series_id)
        raise InvalidUrl


    def download_book_from_id(self, book_id: str) -> Book:
        """Download a single issue from its id"""
        return self._download_issue(book_id)


    def _download_series(self, series_id: str) -> Series:
        """
        Download series from Manga Plus

        :param series_id: Identifier for series
        :returns: Series data
        """
        content = self._session.get(
            f"https://jumpg-api.tokyo-cdn.com/api/title_detailV2",
            params = {
                "title_id": series_id,
                "lang": "eng",
                "os": "android",
                "os_ver": "32",
                "app_ver": "40",
                # Api key extracted from the Manga Plus app
                "secret": "2afb69fbb05f57a1856cf75e1c4b6ee6"
            },
        ).content
        # The api speaks protobuf without a published schema; decode to json
        # and address fields by their numeric tags
        data, _ = blackboxprotobuf.protobuf_to_json(content)
        parsed = json.loads(data)
        title = parsed["1"]["8"]["1"]["2"]
        issues = []

        def add_issues(data: dict, main: str):
            # Field may hold either a single issue or a list of issues
            if main in data:
                x = data[main]
                if isinstance(x, list):
                    for i in x:
                        issues.append(i["2"])
                else:
                    issues.append(x["2"])

        for a in parsed["1"]["8"]["28"]:
            add_issues(a, "2")
            add_issues(a, "3")
            add_issues(a, "4")
        return Series(
            title,
            book_ids = issues
        )


    def _download_issue(self, issue_id: str) -> Book:
        """
        Download issue from Manga Plus

        :param issue_id: Identifier for issue
        :returns: Issue metadata
        :raises DataNotFound: If no title entry is present in the response
        """
        url = f"https://jumpg-webapi.tokyo-cdn.com/api/manga_viewer?chapter_id={issue_id}&split=yes&img_quality=super_high"
        content = self._session.get(url).content
        response, _ = blackboxprotobuf.protobuf_to_json(content)
        images = []
        parsed = json.loads(response)
        title = None
        for image in parsed["1"]["10"]["1"]:
            if "1" in image:
                images.append(
                    OnlineFile(
                        image["1"]["1"],
                        extension = "jpg",
                        encryption = XOrEncryption(bytes.fromhex(image["1"]["5"]))
                    )
                )
            elif "3" in image:
                # NOTE(review): assumes the title entry is the final page
                # element — confirm against the api response
                title = image["3"]["1"]["4"]
        # Guard: previously the method could return None (or hit an unbound
        # `title`) when the response contained no title entry
        if title is None:
            raise DataNotFound
        return Book(
            data = ImageList(images),
            metadata = Metadata(
                title,
                series = parsed["1"]["10"]["5"]
            )
        )


# ==== grawlix/sources/saxo.py ====
from grawlix.book import Book, Metadata, SingleFile, OnlineFile
from grawlix.exceptions import InvalidUrl
from grawlix import AESEncryption

import re
from .source import Source


class Saxo(Source):
    """Source for downloading ebooks from Saxo"""

    name: str = "Saxo"
    match = [
        r"https://(www.)?saxo.(com|dk)/[^/]+/.+\d+$"
    ]
    _authentication_methods = [ "login" ]
    # Set by `login`; required by all api calls
    user_id: str

    def login(self, username: str, password: str, **kwargs) -> None:
        """
        Authenticate with Saxo and store the bearer token on the session

        :param username: Saxo account username
        :param password: Saxo account password
        """
        response = self._session.post(
            "https://auth-read.saxo.com/auth/token",
            data = {
                "username": username,
                "password": password,
                "grant_type": "password",
            },
            headers = {
                "Content-Type": "application/x-www-form-urlencoded"
            }
        )
        json = response.json()
        bearer_token = json["access_token"]
        self._session.headers = {
            "Appauthorization": f"bearer {bearer_token}",
            "App-Os": "android",
            "App-Version": "6.2.4"
        }
        self.user_id = json["id"]


    def download(self, url: str) -> Book:
        """
        Download book metadata from url

        :param url: Url of book
        :returns: Book metadata
        """
        isbn = self._extract_isbn_from_url(url)
        book_id = self._get_book_id(isbn)
        metadata = self._get_book_metadata(book_id)
        ebook_id = metadata["id"] # Id of ebook file
        return Book(
            metadata = self._extract_metadata(metadata),
            data = SingleFile(
                OnlineFile(
                    url = self._get_book_file_link(ebook_id),
                    extension = "epub",
                    # Encryption keys extracted from app
                    encryption = AESEncryption(
                        key = b"CD3E9D141D8EFC0886912E7A8F3652C4",
                        iv = b"78CB354D377772F1"
                    )
                )
            )
        )


    def _get_book_id(self, isbn: str) -> str:
        """
        Download internal book id of book from isbn

        :param isbn: Isbn of book
        :returns: Saxo internal book id
        """
        response = self._session.get(
            f"https://api-read.saxo.com/api/v2/search/user/{self.user_id}/premium/books/{isbn}"
        )
        return response.json()["items"][0]["bookId"]


    def _get_book_metadata(self, book_id: str) -> dict:
        """
        Download metadata of book

        :param book_id: Id of book
        :returns: Metadata of book
        """
        response = self._session.get(
            f"https://api-read.saxo.com/api/v2/book/{book_id}/user/{self.user_id}/details"
        )
        return response.json()["ebooks"][0]


    def _get_book_file_link(self, ebook_id: str) -> str:
        """
        Download link to epub file

        :param ebook_id: Id of ebook file
        :returns: Link to ebook file
        """
        response = self._session.get(
            f"https://api-read.saxo.com/api/v1/book/{ebook_id}/content/encryptedstream/"
        )
        return response.json()["link"]


    @staticmethod
    def _extract_metadata(metadata: dict) -> Metadata:
        """
        Extract metadata from matadata response from Saxo

        :param metadata: Metadata response from saxo
        :returns: Metadata formatted as `grawlix.Metadata`
        """
        return Metadata(metadata["title"])


    @staticmethod
    def _extract_isbn_from_url(url: str) -> str:
        """
        Extracts isbn from url

        :param url: Url of book
        :returns: Isbn of book
        :raises InvalidUrl: If no trailing isbn is found in the url
        """
        # Raw string instead of the original f-string: f"\d+$" triggers a
        # SyntaxWarning for the invalid escape on modern Python
        isbn_match = re.search(r"\d+$", url)
        if isbn_match and isbn_match.group():
            return isbn_match.group()
        # Was `raise NotImplemented` (a TypeError at runtime); a url without
        # an isbn is an invalid url, not a missing feature
        raise InvalidUrl


# ==== grawlix/sources/source.py ====
from grawlix.book import Book, Series, Result

from typing import Generic, TypeVar, Tuple
import requests

T = TypeVar("T")


class Source(Generic[T]):
    """
    General class for downloading books from various sources
    """

    name: str = "UNKNOWN"
    # Regexes for urls this source can handle
    match: list[str] = []
    # Supported authentication methods, e.g. "login"
    _authentication_methods: list[str] = []

    def __init__(self):
        self._session = requests.Session()


    @property
    def requires_authentication(self) -> bool:
        """Does the source require authentication to download books"""
        return len(self._authentication_methods) > 0


    @property
    def supports_login(self) -> bool:
        """Does the source support authentication with username and password"""
        return "login" in self._authentication_methods


    def login(self, username: str, password: str, **kwargs: str):
        """
        Login to source

        :param username: Username of user for source
        :param password: Password of user for source
        """
        raise NotImplementedError


    def download(self, url: str) -> Result[T]:
        """
        Download book metadata from source

        :param url: Url of book to download
        :returns: Book metadata
        """
        raise NotImplementedError


    def download_book_from_id(self, book_id: T) -> Book:
        """
        Download book from id

        :param book_id: Internal id of book
        :returns: Downloaded book metadata
        """
        raise NotImplementedError


# ==== grawlix/sources/webtoons.py ====
from .source import Source
from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
from grawlix.utils import get_arg_from_url
from grawlix.exceptions import InvalidUrl

from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:111.0) Gecko/20100101 Firefox/111.0"
MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1"


class Webtoons(Source[str]):
    """Source for downloading comics from Webtoons"""

    name: str = "Webtoons"
    match = [
        r"https://www.webtoons.com/../.+/.+/.+/viewer\?title_no=\d+&episode_no=\d+",
        r"https://www.webtoons.com/../.+/.+/list\?title_no=\d+"
    ]
    _authentication_methods: list[str] = []

    def download(self, url: str) -> Result[str]:
        """
        Download episode or series metadata from url

        :param url: Url of episode or series
        :returns: Episode or series data
        :raises InvalidUrl: If the url matches no known pattern
        """
        if re.match(self.match[0], url):
            return self._download_episode(url)
        if re.match(self.match[1], url):
            return self._download_series(url)
        raise InvalidUrl


    def download_book_from_id(self, book_id: str) -> Book:
        """Download an episode; the book id is the episode url"""
        return self._download_episode(book_id)


    def _download_series(self, url: str) -> Series[str]:
        """
        Download a series of webtoons

        :param url: Url of series
        :returns: Webtoons series data
        """
        parsed_url = urlparse(url)
        # The mobile site lists every episode on one page
        page = self._session.get(
            f"https://m.webtoons.com{parsed_url.path}",
            params = parsed_url.query,
            headers = {
                "User-Agent": MOBILE_USER_AGENT,
            },
            cookies = {
                "needGDPR": "FALSE",
                "needCCPA": "FALSE",
                "needCOPPA": "FALSE"
            }
        ).text
        soup = BeautifulSoup(page, "lxml")
        title = soup.find("meta", property="og:title").get("content")
        episodes = []
        for episode in soup.find_all("li", class_="_episodeItem"):
            episode_link = episode.find("a").get("href")
            episodes.append(episode_link)
        return Series(
            title,
            book_ids = episodes
        )


    def _download_episode(self, url: str) -> Book:
        """
        Download single webtoon episode

        :param url: Url of episode
        :returns: Episode
        """
        page = self._session.get(url).text
        soup = BeautifulSoup(page, "lxml")
        title = soup.find("h1", class_="subj_episode").get("title")
        series = soup.find("div", class_="subj_info").find("a").get("title")
        images = []
        for image in soup.find("div", class_="viewer_img _img_viewer_area").find_all("img"):
            images.append(
                OnlineFile(
                    url = image.get("data-url"),
                    extension = "png",
                    # Image host rejects requests without a webtoons referer
                    headers = { "Referer": "https://www.webtoons.com/" }
                )
            )
        return Book(
            data = ImageList(images),
            metadata = Metadata(
                title,
                series = series
            )
        )
# ==== grawlix/utils/__init__.py ====
from grawlix.exceptions import DataNotFound

from urllib.parse import urlparse, parse_qs


def get_arg_from_url(url: str, key: str) -> str:
    """
    Extract a query parameter from a url

    :param url: Url to search
    :param key: Name of the query parameter
    :returns: First value of the parameter
    :raises DataNotFound: If the parameter is not present
    """
    parsed_url = urlparse(url)
    query = parse_qs(parsed_url.query)
    try:
        return query[key][0]
    except KeyError:
        raise DataNotFound


# ==== pyproject.toml ====
[project]
name = "grawlix"
authors = [
    { name = "Joakim Holm", email = "mail@joakimholm.xyz" }
]
description = "CLI tool for downloading ebooks"
readme = "README.md"
keywords = ["ebook", "cli", "downloader"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
]
dependencies = [
    "appdirs",
    "beautifulsoup4",
    "blackboxprotobuf",
    "importlib-resources",
    "lxml",
    "pycryptodome",
    "requests",
    "rich",
    "tomli"
]
dynamic = ["version"]

[project.urls]
"Homepage" = "https://github.com/jo1gi/ebook-dl"
"Bugtracker" = "https://github.com/jo1gi/ebook-dl/issues"

[project.scripts]
# Entry point renamed: "audiobook-dl" was a leftover from another project
grawlix = "grawlix.__main__:main"


[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"


[tool.setuptools.dynamic]
version = {attr = "grawlix.__version__"}

[tool.setuptools.package-data]
# "mypkg" was a template leftover; the package is grawlix
grawlix = ["*.txt"]


[tool.mypy]
ignore_missing_imports = true
allow_untyped_globals = false
disallow_untyped_calls = true


# ==== setup.py ====
from setuptools import setup

setup()


# ==== shell.nix ====
with import <nixpkgs> {};

let
  blackboxprotobuf = python3Packages.buildPythonPackage rec {
    pname = "blackboxprotobuf";
    version = "1.0.1";

    src = python3Packages.fetchPypi {
      inherit pname version;
      sha256 = "sha256-IztxTmwkzp0cILhxRioiCvkXfk/sAcG3l6xauGoeHOo=";
    };

    propagatedBuildInputs = with python3Packages; [
      protobuf
    ];

    # sed -i edits in place; the previous `sed ... > requirements.txt`
    # truncated the file before sed read it, leaving it empty
    patchPhase = ''
      sed -i 's/protobuf==3.10.0/protobuf/' requirements.txt
    '';

    doCheck = false;
  };
in
mkShell {
  buildInputs = [
    (python3.withPackages(ps: with ps; [
      appdirs
      beautifulsoup4
      blackboxprotobuf
      importlib-resources
      lxml
      pycryptodome
      requests
      rich
      tomli

      # Test
      pytest
      mypy
      types-requests
      types-setuptools

      # Build
      build
      setuptools
      twine
    ]))
  ];
}