25
25
26
26
from lxml import etree
27
27
28
- from kraken .containers import BaselineLine , Region , Segmentation
28
+ from kraken .containers import BBoxLine , BaselineLine , Region , Segmentation
29
29
30
30
logger = logging .getLogger (__name__ )
31
31
60
60
61
61
62
62
class XMLPage (object ):
63
+ """
64
+ Parses XML facsimiles in ALTO or PageXML format.
63
65
66
+ The parser is able to deal with most (but not all) features supported by
67
+ those standards. In particular, any data below the line level is discarded.
68
+
69
+ Args:
70
+ filename: Path to the XML file
71
+ filetype: Explicit choice of subparser; one of 'xml', 'alto', or 'page'.
72
+ linetype: Whether to parse line data as baselines ('baselines') or as bounding boxes ('bbox').
73
+ """
64
74
type : Literal ['baselines' , 'bbox' ] = 'baselines'
65
75
base_dir : Optional [Literal ['L' , 'R' ]] = None
66
76
imagename : 'PathLike' = None
@@ -73,10 +83,12 @@ class XMLPage(object):
73
83
74
84
def __init__ (self ,
75
85
filename : Union [str , 'PathLike' ],
76
- filetype : Literal ['xml' , 'alto' , 'page' ] = 'xml' ):
86
+ filetype : Literal ['xml' , 'alto' , 'page' ] = 'xml' ,
87
+ linetype : Literal ['baselines' , 'bbox' ] = 'baselines' ):
77
88
super ().__init__ ()
78
89
self .filename = Path (filename )
79
90
self .filetype = filetype
91
+ self .type = linetype
80
92
81
93
self ._regions = {}
82
94
self ._lines = {}
@@ -150,17 +162,17 @@ def _parse_alto(self):
150
162
boundary = None
151
163
if coords is not None :
152
164
boundary = self ._parse_alto_pointstype (coords .get ('POINTS' ))
153
- elif (region .get ('HPOS' ) is not None and region .get ('VPOS' ) is not None and
154
- region .get ('WIDTH' ) is not None and region .get ('HEIGHT' ) is not None ):
165
+ else :
166
+ reg_pos = region .get ('HPOS' ), region .get ('VPOS' ), region .get ('WIDTH' ), region .get ('HEIGHT' )
167
+ try :
168
+ x_min , y_min , width , height = map (int , map (float , reg_pos ))
169
+ boundary = [(x_min , y_min ),
170
+ (x_min , y_min + height ),
171
+ (x_min + width , y_min + height ),
172
+ (x_min + width , y_min )]
173
+ except ValueError :
174
+ pass
155
175
# use rectangular definition
156
- x_min = int (float (region .get ('HPOS' )))
157
- y_min = int (float (region .get ('VPOS' )))
158
- width = int (float (region .get ('WIDTH' )))
159
- height = int (float (region .get ('HEIGHT' )))
160
- boundary = [(x_min , y_min ),
161
- (x_min , y_min + height ),
162
- (x_min + width , y_min + height ),
163
- (x_min + width , y_min )]
164
176
rtype = region .get ('TYPE' )
165
177
# fall back to default region type if nothing is given
166
178
tagrefs = region .get ('TAGREFS' )
@@ -177,24 +189,34 @@ def _parse_alto(self):
177
189
178
190
# parse lines in region
179
191
for line in region .iterfind ('./{*}TextLine' ):
180
- if line .get ('BASELINE' ) is None :
181
- logger .info ('TextLine {} without baseline' .format (line .get ('ID' )))
182
- continue
183
- pol = line .find ('./{*}Shape/{*}Polygon' )
184
- boundary = None
185
- if pol is not None :
192
+ line_id = line .get ('ID' )
193
+ if self .type == 'baselines' :
194
+ if line .get ('BASELINE' ) is None :
195
+ logger .info (f'TextLine { line_id } without baseline' )
196
+ continue
197
+ pol = line .find ('./{*}Shape/{*}Polygon' )
198
+ boundary = None
199
+ if pol is not None :
200
+ try :
201
+ boundary = self ._parse_alto_pointstype (pol .get ('POINTS' ))
202
+ except ValueError :
203
+ logger .info (f'TextLine { line_id } without polygon' )
204
+ else :
205
+ logger .info (f'TextLine { line_id } without polygon' )
206
+
207
+ baseline = None
186
208
try :
187
- boundary = self ._parse_alto_pointstype (pol .get ('POINTS ' ))
209
+ baseline = self ._parse_alto_pointstype (line .get ('BASELINE ' ))
188
210
except ValueError :
189
- logger .info ('TextLine {} without polygon' . format ( line . get ( 'ID' )) )
190
- else :
191
- logger . info ( 'TextLine {} without polygon' . format ( line .get ('ID' )) )
192
-
193
- baseline = None
194
- try :
195
- baseline = self . _parse_alto_pointstype ( line . get ( 'BASELINE' ))
196
- except ValueError :
197
- logger . info ( 'TextLine {} without baseline' . format ( line . get ( 'ID' )))
211
+ logger .info (f 'TextLine { line_id } without baseline' )
212
+ elif self . type == 'bbox' :
213
+ line_pos = line . get ( 'HPOS' ), line . get ( 'VPOS' ), line .get ('WIDTH' ), line . get ( 'HEIGHT' )
214
+ try :
215
+ x_min , y_min , width , height = map ( int , map ( float , line_pos ))
216
+ bbox = ( x_min , y_min , x_min + width , y_min + height )
217
+ except ValueError :
218
+ logger . info ( f'TextLine { line_id } without complete bounding box data' )
219
+ continue
198
220
199
221
text = ''
200
222
for el in line .xpath (".//*[local-name() = 'String'] | .//*[local-name() = 'SP']" ):
@@ -214,15 +236,26 @@ def _parse_alto(self):
214
236
tags [ttype ] = ltype
215
237
if ltype in ['train' , 'validation' , 'test' ]:
216
238
split_type = ltype
217
- self ._lines [line .get ('ID' )] = BaselineLine (id = line .get ('ID' ),
218
- baseline = baseline ,
219
- boundary = boundary ,
220
- text = text ,
221
- tags = tags ,
222
- split = split_type ,
223
- regions = [region_id ])
239
+
240
+ if self .type == 'baselines' :
241
+ line_obj = BaselineLine (id = line_id ,
242
+ baseline = baseline ,
243
+ boundary = boundary ,
244
+ text = text ,
245
+ tags = tags ,
246
+ split = split_type ,
247
+ regions = [region_id ])
248
+ elif self .type == 'bbox' :
249
+ line_obj = BBoxLine (id = line_id ,
250
+ bbox = bbox ,
251
+ text = text ,
252
+ tags = tags ,
253
+ split = split_type ,
254
+ regions = [region_id ])
255
+
256
+ self ._lines [line_id ] = line_obj
224
257
# register implicit reading order
225
- self ._orders ['line_implicit' ]['order' ].append (line . get ( 'ID' ) )
258
+ self ._orders ['line_implicit' ]['order' ].append (line_id )
226
259
227
260
self ._regions = region_data
228
261
@@ -593,10 +626,10 @@ def to_container(self) -> Segmentation:
593
626
"""
594
627
Returns a Segmentation object.
595
628
"""
596
- return Segmentation (type = 'baselines' ,
629
+ return Segmentation (type = self . type ,
597
630
imagename = self .imagename ,
598
631
text_direction = 'horizontal_lr' ,
599
632
script_detection = True ,
600
633
lines = self .get_sorted_lines (),
601
634
regions = self ._regions ,
602
- line_orders = [] )
635
+ line_orders = list ( self . reading_orders . values ()) )
0 commit comments