Skip to content

Commit 1167c49

Browse files
committed
Allow parsing of ALTO files into bounding box format
Add a `linetype` option to XMLPage that makes it return BBoxLine objects instead of BaselineLine objects.
1 parent 14778a4 commit 1167c49

File tree

1 file changed

+71
-38
lines changed

1 file changed

+71
-38
lines changed

kraken/lib/xml.py

Lines changed: 71 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525

2626
from lxml import etree
2727

28-
from kraken.containers import BaselineLine, Region, Segmentation
28+
from kraken.containers import BBoxLine, BaselineLine, Region, Segmentation
2929

3030
logger = logging.getLogger(__name__)
3131

@@ -60,7 +60,17 @@
6060

6161

6262
class XMLPage(object):
63+
"""
64+
Parses XML facsimiles in ALTO or PageXML format.
6365
66+
The parser is able to deal with most (but not all) features supported by
67+
those standards. In particular, any data below the line level is discarded.
68+
69+
Args:
70+
filename: Path to the XML file
71+
filetype: Selector for explicit subparser choice.
72+
linetype: Parse line data as baselines or bounding box type.
73+
"""
6474
type: Literal['baselines', 'bbox'] = 'baselines'
6575
base_dir: Optional[Literal['L', 'R']] = None
6676
imagename: 'PathLike' = None
@@ -73,10 +83,12 @@ class XMLPage(object):
7383

7484
def __init__(self,
7585
filename: Union[str, 'PathLike'],
76-
filetype: Literal['xml', 'alto', 'page'] = 'xml'):
86+
filetype: Literal['xml', 'alto', 'page'] = 'xml',
87+
linetype: Literal['baselines', 'bbox'] = 'baselines'):
7788
super().__init__()
7889
self.filename = Path(filename)
7990
self.filetype = filetype
91+
self.type = linetype
8092

8193
self._regions = {}
8294
self._lines = {}
@@ -150,17 +162,17 @@ def _parse_alto(self):
150162
boundary = None
151163
if coords is not None:
152164
boundary = self._parse_alto_pointstype(coords.get('POINTS'))
153-
elif (region.get('HPOS') is not None and region.get('VPOS') is not None and
154-
region.get('WIDTH') is not None and region.get('HEIGHT') is not None):
165+
else:
166+
reg_pos = region.get('HPOS'), region.get('VPOS'), region.get('WIDTH'), region.get('HEIGHT')
167+
try:
168+
x_min, y_min, width, height = map(int, map(float, reg_pos))
169+
boundary = [(x_min, y_min),
170+
(x_min, y_min + height),
171+
(x_min + width, y_min + height),
172+
(x_min + width, y_min)]
173+
except ValueError:
174+
pass
155175
# use rectangular definition
156-
x_min = int(float(region.get('HPOS')))
157-
y_min = int(float(region.get('VPOS')))
158-
width = int(float(region.get('WIDTH')))
159-
height = int(float(region.get('HEIGHT')))
160-
boundary = [(x_min, y_min),
161-
(x_min, y_min + height),
162-
(x_min + width, y_min + height),
163-
(x_min + width, y_min)]
164176
rtype = region.get('TYPE')
165177
# fall back to default region type if nothing is given
166178
tagrefs = region.get('TAGREFS')
@@ -177,24 +189,34 @@ def _parse_alto(self):
177189

178190
# parse lines in region
179191
for line in region.iterfind('./{*}TextLine'):
180-
if line.get('BASELINE') is None:
181-
logger.info('TextLine {} without baseline'.format(line.get('ID')))
182-
continue
183-
pol = line.find('./{*}Shape/{*}Polygon')
184-
boundary = None
185-
if pol is not None:
192+
line_id = line.get('ID')
193+
if self.type == 'baselines':
194+
if line.get('BASELINE') is None:
195+
logger.info(f'TextLine {line_id} without baseline')
196+
continue
197+
pol = line.find('./{*}Shape/{*}Polygon')
198+
boundary = None
199+
if pol is not None:
200+
try:
201+
boundary = self._parse_alto_pointstype(pol.get('POINTS'))
202+
except ValueError:
203+
logger.info(f'TextLine {line_id} without polygon')
204+
else:
205+
logger.info(f'TextLine {line_id} without polygon')
206+
207+
baseline = None
186208
try:
187-
boundary = self._parse_alto_pointstype(pol.get('POINTS'))
209+
baseline = self._parse_alto_pointstype(line.get('BASELINE'))
188210
except ValueError:
189-
logger.info('TextLine {} without polygon'.format(line.get('ID')))
190-
else:
191-
logger.info('TextLine {} without polygon'.format(line.get('ID')))
192-
193-
baseline = None
194-
try:
195-
baseline = self._parse_alto_pointstype(line.get('BASELINE'))
196-
except ValueError:
197-
logger.info('TextLine {} without baseline'.format(line.get('ID')))
211+
logger.info(f'TextLine {line_id} without baseline')
212+
elif self.type == 'bbox':
213+
line_pos = line.get('HPOS'), line.get('VPOS'), line.get('WIDTH'), line.get('HEIGHT')
214+
try:
215+
x_min, y_min, width, height = map(int, map(float, line_pos))
216+
bbox = (x_min, y_min, x_min+width, y_min+height)
217+
except ValueError:
218+
logger.info(f'TextLine {line_id} without complete bounding box data')
219+
continue
198220

199221
text = ''
200222
for el in line.xpath(".//*[local-name() = 'String'] | .//*[local-name() = 'SP']"):
@@ -214,15 +236,26 @@ def _parse_alto(self):
214236
tags[ttype] = ltype
215237
if ltype in ['train', 'validation', 'test']:
216238
split_type = ltype
217-
self._lines[line.get('ID')] = BaselineLine(id=line.get('ID'),
218-
baseline=baseline,
219-
boundary=boundary,
220-
text=text,
221-
tags=tags,
222-
split=split_type,
223-
regions=[region_id])
239+
240+
if self.type == 'baselines':
241+
line_obj = BaselineLine(id=line_id,
242+
baseline=baseline,
243+
boundary=boundary,
244+
text=text,
245+
tags=tags,
246+
split=split_type,
247+
regions=[region_id])
248+
elif self.type == 'bbox':
249+
line_obj = BBoxLine(id=line_id,
250+
bbox=bbox,
251+
text=text,
252+
tags=tags,
253+
split=split_type,
254+
regions=[region_id])
255+
256+
self._lines[line_id] = line_obj
224257
# register implicit reading order
225-
self._orders['line_implicit']['order'].append(line.get('ID'))
258+
self._orders['line_implicit']['order'].append(line_id)
226259

227260
self._regions = region_data
228261

@@ -593,10 +626,10 @@ def to_container(self) -> Segmentation:
593626
"""
594627
Returns a Segmentation object.
595628
"""
596-
return Segmentation(type='baselines',
629+
return Segmentation(type=self.type,
597630
imagename=self.imagename,
598631
text_direction='horizontal_lr',
599632
script_detection=True,
600633
lines=self.get_sorted_lines(),
601634
regions=self._regions,
602-
line_orders=[])
635+
line_orders=list(self.reading_orders.values()))

0 commit comments

Comments
 (0)