Commit e69c18a2 authored by Hermann Krumrey's avatar Hermann Krumrey

Improvements to chapter bundling mechanisms

parent d91d3adb
......@@ -54,7 +54,10 @@ def main(args: argparse.Namespace):
))
for c in chapters:
c.download()
if args.list:
print(c)
else:
c.download()
if __name__ == "__main__":
......@@ -72,6 +75,8 @@ if __name__ == "__main__":
parser.add_argument("-f", "--format",
choices={"cbz", "raw"}, default="cbz",
help="The format in which to store the chapters")
parser.add_argument("-l", "--list", action="store_true",
help="Lists all found chapters")
for _scraper_cls in scrapers:
parser.add_argument("--{}-id".format(_scraper_cls.name()),
......
......@@ -42,7 +42,8 @@ class Chapter:
destination_dir: str,
_format: str,
page_load_callback: Callable[['Chapter', str], List[str]],
title: Optional[str] = None
title: Optional[str] = None,
group: Optional[str] = None
):
"""
Initializes the manga chapter
......@@ -55,6 +56,7 @@ class Chapter:
:param _format: The format in which to store the chapter when
downloading by default
:param title: The title of the chapter
:param group: The group that scanlated this chapter
:param page_load_callback:
"""
self.logger = logging.getLogger(self.__class__.__name__)
......@@ -66,10 +68,13 @@ class Chapter:
self.format = _format
self._page_load_callback = page_load_callback
self._pages = [] # type: List[str]
self._additional_urls = [] # type: List[str]
self._last_additional_urls = [] # type: List[str]
self.group = group
self.title = title
if self.chapter_number == "":
self.chapter_number = "0"
if self.chapter_number == "" or chapter_number == "0":
self.chapter_number = "0.0"
@property
def name(self) -> str:
......@@ -77,8 +82,10 @@ class Chapter:
:return: The name of the chapter
"""
name = "{} - Chapter {}".format(self.series_name, self.chapter_number)
if self.title is not None:
if self.title is not None and self.title != "":
name += " - " + self.title
if self.group is not None and self.group != "":
name += " ({})".format(self.group)
return name
@property
......@@ -87,10 +94,72 @@ class Chapter:
Lazy-loads the URLs of the chapter's page images
:return: The list of page images, in the correct order
"""
if len(self._pages) == 0:
new_urls = self._last_additional_urls != self._additional_urls
if len(self._pages) == 0 or new_urls:
self._pages = self._page_load_callback(self, self.url)
for url in self._additional_urls:
self._pages += self._page_load_callback(self, url)
self._last_additional_urls = list(self._additional_urls)
return self._pages
@property
def macro_chapter(self) -> int:
    """
    Calculates the 'macro' chapter number. For example:
    12 -> 12
    15.5 -> 15
    EX4 -> 4
    Chapter numbers that contain no digits at all (e.g. 'EX') map
    to 0 instead of raising a ValueError from int("").
    :return: The macro chapter number
    """
    macro = self.chapter_number.split(".")[0]
    macro_num = "".join(filter(str.isnumeric, macro))
    # Guard: int("") raises ValueError, so fall back to 0
    return int(macro_num) if macro_num != "" else 0
@property
def micro_chapter(self) -> int:
    """
    Calculates the 'micro' chapter number. For example:
    12 -> 0
    15.5 -> 5
    EX4 -> 0
    Micro parts that contain no digits (e.g. '15.EX') map to 0
    instead of raising a ValueError from int("").
    :return: The micro chapter number
    """
    parts = self.chapter_number.split(".")
    if len(parts) < 2:
        # No decimal part present at all
        return 0
    micro_num = "".join(filter(str.isnumeric, parts[1]))
    # Guard: int("") raises ValueError, so fall back to 0
    return int(micro_num) if micro_num != "" else 0
@property
def is_special(self) -> bool:
    """
    :return: Whether or not this is a 'special' chapter (Omake etc)
    """
    number = self.chapter_number
    # Decimal parts and macro number 0 always mark a special chapter
    if "." in number or self.macro_chapter == 0:
        return True
    # Otherwise, anything that isn't a plain integer is special
    try:
        int(number)
    except ValueError:
        return True
    return False
def add_additional_url(self, url: str):
    """
    Adds a URL to the chapter's list of additional URLs.
    Useful for multi-part chapters
    :param url: The URL to add
    :return: None
    """
    # Extend in place so existing references to the list stay valid
    self._additional_urls.extend((url,))
def download(
self,
file_path_override: Optional[str] = None,
......@@ -158,3 +227,14 @@ class Chapter:
:return: The string representation of the object
"""
return self.name
def __eq__(self, other: object) -> bool:
    """
    Checks for equality with other objects
    Two chapters are considered equal if they point to the same URL.
    :param other: The other object
    :return: Whether or not the objects are the same
    """
    if not isinstance(other, Chapter):
        return False
    else:
        return other.url == self.url

def __hash__(self) -> int:
    """
    Defining __eq__ implicitly sets __hash__ to None in Python 3,
    which would make Chapter objects unusable in sets and as
    dictionary keys. Hash on the same attribute __eq__ compares.
    :return: The hash of the chapter's URL
    """
    return hash(self.url)
......@@ -94,7 +94,21 @@ class Scraper:
url = self.generate_url(_id)
chapters = self._load_chapters(str(url))
chapters = self._remove_other_languages(chapters)
chapters = self._sort_chapters(chapters)
chapters = self._deduplicate_chapters(chapters)
chapters = self._combine_multipart_chapters(chapters)
return chapters
@staticmethod
def _sort_chapters(chapters: List[Chapter]) -> List[Chapter]:
"""
Sorts a list of chapters. First by their total chapter number,
then their macro chapter number
:param chapters:
:return:
"""
# Both sort steps are necessary!
chapters.sort(
key=lambda x: str(x.chapter_number).zfill(15)
......@@ -102,8 +116,134 @@ class Scraper:
chapters.sort(
key=lambda x: str(x.chapter_number.split(".")[0]).zfill(15)
)
return chapters
def _remove_other_languages(self, chapters: List[Chapter]) \
        -> List[Chapter]:
    """
    Removes unwanted languages from the chapter list
    :param chapters: The chapter list
    :return: The chapter list without unwanted language entries
    """
    return [
        chapter for chapter in chapters
        if chapter.language in self.languages
    ]
def _combine_multipart_chapters(self, chapters: List[Chapter]) \
        -> List[Chapter]:
    """
    Combines multipart chapters with each other (e.g. 12.1 and 12.2)
    Walks the (pre-sorted) chapter list, collecting consecutive parts
    of the same macro chapter and merging them into the first part
    via _combine_chapters. Assumes the list is sorted by chapter
    number; parts that are not consecutive are kept separate.
    :param chapters: The list of chapters to work through
    :return: The new chapter list
    """
    if len(chapters) < 2:
        return chapters

    last_chapter = chapters.pop(0)
    combined_chapters = []  # type: List[Chapter]
    to_combine = []  # type: List[Chapter]
    # Expected micro-chapter gap between the master part and the
    # next part (grows by 1 for every part already collected)
    diff = 1
    for chapter in chapters:
        # When a new macro chapter starts with part X.1, rename it
        # to plain X — it becomes the master for that chapter.
        new_chapter = last_chapter.macro_chapter != chapter.macro_chapter
        if chapter.micro_chapter == 1 and new_chapter:
            self.logger.debug("Marking chapter {} as {}".format(
                chapter.chapter_number, chapter.macro_chapter
            ))
            chapter.chapter_number = str(chapter.macro_chapter)

        if last_chapter.macro_chapter == chapter.macro_chapter:
            # A continuation part has exactly the next expected
            # micro number
            same_chapter = \
                last_chapter.micro_chapter + diff == chapter.micro_chapter
            # Special case: plain chapter X followed by X.2
            # (X itself counts as the implicit X.1)
            if last_chapter.micro_chapter == 0 \
                    and chapter.micro_chapter == 2:
                same_chapter = True
                diff = 2
            if same_chapter:
                to_combine.append(chapter)
                diff += 1
                continue

        # Not a continuation: flush the collected group. Parts are
        # only merged when the master is a whole chapter (X) or a
        # first part (X.1); otherwise they are kept as-is below.
        if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
            self._combine_chapters(last_chapter, to_combine)
            to_combine = []
        diff = 1
        combined_chapters.append(last_chapter)
        # If the group was merged, to_combine is empty here; if not,
        # the unmerged parts are appended individually.
        combined_chapters += to_combine
        to_combine = []
        last_chapter = chapter

    # Flush the final group after the loop ends
    if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
        self._combine_chapters(last_chapter, to_combine)
        to_combine = []

    combined_chapters.append(last_chapter)
    combined_chapters += to_combine
    return combined_chapters
def _combine_chapters(self, chapter: Chapter, to_combine: List[Chapter]):
    """
    Adds chapters to a chapter
    The master chapter is renamed to its macro chapter number and the
    other parts' URLs are attached to it as additional URLs.
    :param chapter: The master chapter
    :param to_combine: The chapters to add
    :return: None
    """
    merged_numbers = [chapter.chapter_number] \
        + [extra.chapter_number for extra in to_combine]
    chapter.chapter_number = str(chapter.macro_chapter)
    for extra in to_combine:
        chapter.add_additional_url(extra.url)
    self.logger.debug("Combined chapters: {}".format(merged_numbers))
def _deduplicate_chapters(self, chapters: List[Chapter]) -> List[Chapter]:
    """
    Removes duplicate chapters from a list
    The chapter to use is based on which scanlation group was most often
    found in the other chapters
    :param chapters: The chapters to work through
    :return: The deduplicated list of chapters
    """
    if len(chapters) < 2:
        return chapters

    # Count how many chapters each group contributed and index the
    # chapters by their chapter number
    group_counts = {}
    by_number = {}
    for entry in chapters:
        group_counts[entry.group] = group_counts.get(entry.group, 0) + 1
        by_number.setdefault(entry.chapter_number, []).append(entry)

    # For duplicated numbers, keep only the entry whose group
    # contributed the most chapters overall
    for number, candidates in by_number.items():
        if len(candidates) > 1:
            favourite = max(candidates, key=lambda c: group_counts[c.group])
            by_number[number] = [favourite]

    kept = []
    for entry in chapters:
        if by_number[entry.chapter_number][0] == entry:
            kept.append(entry)
        else:
            self.logger.debug("Discarding duplicate chapter {}"
                              .format(entry))
    return kept
def _load_chapters(self, url: str) -> List[Chapter]:
"""
Scraper-specific implementation that loads chapters from the website
......
......@@ -95,6 +95,7 @@ class MangaDexScraper(Scraper):
self.format,
self.get_image_pages,
chapter["title"],
chapter["group_name"]
))
return chapters
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment