-
Notifications
You must be signed in to change notification settings - Fork 1
/
EpubChapter.py
79 lines (52 loc) · 2.09 KB
/
EpubChapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from lxml import html
import json
class EpubChapter:
    '''A single chapter in an EPub.

    The chapter's content file is read from the supplied ``epub`` container
    and parsed into an lxml HTML tree when the object is constructed. The
    paragraph list (and, if none was supplied, the title) is derived lazily
    the first time get_paragraph_text() is called.
    '''

    # The id used to reference this chapter as used in content.opf
    idref = ""
    # The name of the file holding the contents of this EpubChapter
    contentfile = ""
    # Play order
    play_order = 0
    # Chapter contents from the contentfile read as a HTML tree
    __content = None
    # List of paragraphs, each of an Element. This class-level value is only a
    # fallback default; __init__ rebinds a fresh list on every instance.
    __paragraphs = []
    # Chapter title
    title = ""

    def __init__(self, epub, idref, play_order, contentfile, title=""):
        '''Create a chapter and parse its content file.

        epub        -- container object exposing open(name), which must return
                       something lxml's html.parse() accepts (presumably a
                       zipfile.ZipFile; confirm against the caller)
        idref       -- id of this chapter as used in content.opf
        play_order  -- play order of this chapter
        contentfile -- name of the file inside ``epub`` holding the chapter
        title       -- optional chapter title; when empty, a title is guessed
                       from the first <h2> once the paragraphs are parsed
        '''
        self.idref = idref
        self.play_order = play_order
        self.contentfile = contentfile
        if title:
            self.title = title
        # Give each instance its own list instead of sharing the class-level one.
        self.__paragraphs = []

        source = epub.open(contentfile)
        try:
            self.__content = html.parse(source)
        finally:
            # html.parse() has consumed the stream by now; release the handle
            # instead of leaking it. Guarded because open() may hand back
            # something without a close() method.
            close = getattr(source, "close", None)
            if close is not None:
                close()

    def __parse_paragraphs(self):
        '''Parse the Chapter's contents to derive a list of text paragraphs making up the Chapter.'''
        cr = self.__content.getroot()  # Get the content root Element
        self.__paragraphs = cr.cssselect("p")  # Select all the paragraphs via <p> tag

        # Attempt to derive the title of the Chapter, most often found in the first <h2> tag.
        # It's a rough heuristic to determine the chapter title, but hey, something is better than nothing.
        if self.title == "":
            headings = cr.cssselect("h2")
            if headings:
                self.title = headings[0].text_content()
            # Otherwise: oh well, trying to determine the chapter title failed.

    def get_original_text(self):
        '''Returns the original HTML content of a Chapter, or "" if no content has been parsed.'''
        if self.__content is not None:
            return html.tostring(self.__content)
        return ""

    def get_paragraph_text(self, full_body=False):
        '''Returns the HTML making up the paragraphs of the chapter.

        Every <p> element of the chapter is serialised, the pieces are joined
        with newline characters and wrapped in a single <div>. When
        ``full_body`` is true the <div> is additionally re-parsed as a complete
        HTML document before being serialised.
        '''
        if not self.__paragraphs:
            self.__parse_paragraphs()

        # Serialise to text rather than bytes: html.tostring() returns bytes by
        # default on Python 3, and joining bytes with a text separator raises
        # TypeError.
        pieces = [html.tostring(p, encoding="unicode") for p in self.__paragraphs]

        # Wrap the <p>'s in a <div>
        fragment = html.fragment_fromstring(u"\n".join(pieces), create_parent='div')
        if full_body:
            return html.tostring(html.document_fromstring(html.tostring(fragment)))
        return html.tostring(fragment)