1
2
3
"""
This module contains several convenient parsers and functions to easily parse
formatted text into values usable with Python or NumPy functions.

The main function is parseConfigFile, which reads a text file
structured in sections and converts it into a Python object with the same
structure, but where the information is automatically converted into
Python or NumPy objects
(see http://www.scipy.org/Cookbook/Reading_Custom_Text_Files_with_Pyparsing)


== Parser definitions ==

    * number : match strings representing integers or floats and return the
               corresponding Python object

    * pyValue : match strings representing a number, None, True, False, NaN or
                a quoted string and return the corresponding Python object


== Parser generators ==

    * variableParser : create a parser to match variable names and return a cleaned version of them

    * paramParser : create a parser to match a set of parameter definitions

    * tableColParser : create a parser to match a table, defined column by column

    * tableRowParser : create a parser to match a table, defined row by row

    * matrixParser : create a parser to match a matrix

See the corresponding docstrings for more information, and parseConfigFile for
a usage example.
"""
39 from pyparsing import *
40 from numpy import array, NAN
41 from re import VERBOSE
42
__version__ = '0.1.1'

# Public API of the module, i.e. the names exported by ``from <module> import *``
__all__ = [
    'number',
    'pyValue',
    'matrixParser',
    'tableRowParser',
    'tableColParser',
    'paramParser',
    'variableParser',
    'parseConfigFile',
]
48
49
50
51
def variableParser(escapedChars, baseChars=alphanums):
    """ Build a parser for variable names.

    A name is made of runs of characters taken from baseChars, joined by runs
    of characters taken from escapedChars. Every joining run is replaced with
    a single '_' and the pieces are combined into one token.

    The '_' character is always accepted as a joining character, in addition
    to the ones given in escapedChars.
    """
    # Whitespace the separator parser is allowed to skip: a character that is
    # itself a separator (e.g. ' ') must be matched, not skipped.
    skippable = ''.join(c for c in ' \t\r' if c not in escapedChars)

    separator = Word(escapedChars + '_')
    separator.setParseAction(replaceWith('_'))
    separator = separator.setWhitespaceChars(skippable)

    tail = Optional(OneOrMore(separator + Word(baseChars)))
    return Combine(Word(baseChars) + tail)
62
def convertNumber(t):
    """Convert the token matched by ``number`` into a python int or float.

    The value is a float as soon as one of the named groups float1, float2 or
    float3 is filled, otherwise it is an int. The result is returned in a
    one-element list.
    """
    looks_like_float = t.float1 or t.float2 or t.float3
    convert = float if looks_like_float else int
    return [convert(t[0])]
67
68
# Parser for integer and float literals. The named groups float1, float2 and
# float3 only match a fractional part or an exponent; convertNumber looks at
# them to choose between int and float.
number = Regex(
    r"""
    [+-]?                           # optional sign
    (
        (?:\d+(?P<float1>\.\d*)?)   # match 2 or 2.02
        |                           # or
        (?P<float2>\.\d+)           # match .02
    )
    (?P<float3>[Ee][+-]?\d+)?       # optional exponent
    """,
    flags=VERBOSE,
).setParseAction(convertNumber)
80
# Elementary parsers for python literals. They are tried in this order by
# pyValue below, so the triple-quoted strings come before the single-quoted ones.
pyValue_list = [
    number,
    Keyword('True').setParseAction(replaceWith(True)),
    Keyword('False').setParseAction(replaceWith(False)),
    Keyword('NAN', caseless=True).setParseAction(replaceWith(NAN)),
    Keyword('None').setParseAction(replaceWith(None)),
    QuotedString('"""', multiline=True),
    QuotedString("'''", multiline=True),
    QuotedString('"'),
    QuotedString("'"),
]

# End of line, removed from the results
EOL = Suppress(LineEnd())

# Any python value. Every elementary parser is reconfigured in place to skip
# only spaces, tabs and carriage returns (not newlines) before matching.
pyValue = MatchFirst(e.setWhitespaceChars(' \t\r') for e in pyValue_list)

# Unit written between parentheses. '(-)', '(/)' and '()' produce no token,
# any other content is returned as the unit string.
unitDef = (
    Suppress('(')
    + (Suppress(oneOf('- /')) | Optional(Word(alphanums + '^*/-._')))
    + Suppress(')')
)

# Lower-cased variable names, with and without spaces allowed inside the name
keyName = variableParser(' _-./').setParseAction(downcaseTokens)
keyNameWithoutSpace = variableParser('_-./').setParseAction(downcaseTokens)
98
99
100
101
def paramParser(comment='#'):
    """ Create a pattern matching a set of parameter definitions of the form

            variable_name (unit) = value        (the unit is optional)

    Variable names can contain spaces or any character in '_-./'; these
    characters are replaced with '_' and the resulting name is cast to
    lowercase.

    The value can be any standard python value (int, float, None, False, True,
    NaN or quoted string) or a raw string. A raw string stops at the comment
    marker and can span several lines if the additional lines start with a
    space or a tab (and not with '[').

    comment : string introducing a comment inside raw values

    Return a Dict element to allow accessing data using the variable name as a key.

    This Dict has two special fields :
        names_ : the list of variable names found
        unit_  : a dict in the form {key : unit}
    """

    def toKeyValuePairs(t):
        """ Turn the parsed definitions into (key, value) pairs usable by Dict

        Two pairs are added at the end :
            names_ : the list of variable names found
            unit_  : a dict in the form {key : unit}, only for the
                     definitions which have a unit
        """
        pairs = [ParseResults([row.name, row.value]) for row in t]
        names = [row.name for row in t]
        units = dict((row.name, row.unit[0]) for row in t if row.unit)

        pairs.append(ParseResults(['names_', names]))
        pairs.append(ParseResults(['unit_', units]))
        return pairs

    def stripTokens(t):
        """ Remove the whitespace surrounding every token """
        return [x.strip() for x in t]

    # One physical line of raw text, ended by the line end or by a comment
    lineTail = lineEnd | Suppress(comment+restOfLine)
    rawLine = CharsNotIn(comment + '\n') + lineTail

    # Continuation lines start with spaces/tabs and can't start a new section
    continuation = White(' \t').suppress() + NotAny('[') + rawLine
    rawValue = Combine(rawLine + ZeroOrMore(continuation))
    rawValue.setParseAction(stripTokens)

    valueDef = pyValue | rawValue
    paramDef = (keyName('name')
                + Optional(unitDef)('unit')
                + Suppress("="+empty)
                + valueDef('value'))

    paramBloc = OneOrMore(Group(paramDef))
    paramBloc.setParseAction(toKeyValuePairs)

    return Dict(paramBloc)
155
156
157
158
def tableColParser():
    """ Define a pattern matching a table described in columns according to this schema :
            Name_1      Name_2    ...    Name_n
            (unit_1)   (unit_2)   ...   (unit_n)
            value_11   value_21   ...   value_n1
              ...         ...     ...      ...

    Names can't contain any whitespace.
    The unit line is mandatory and must have one '(...)' per column, but a
    unit can be left empty : '()', '(-)' and '(/)' mean "no unit".

    Value can be any standard python value (int, number, None, False, True, NaN
    or quoted strings) or a raw string which can't contain spaces or '['.

    Return a Dict element to allow accessing data using the column name as a key.

    This Dict has two special fields :
        names_ : the column names found (the parsed header line)
        unit_  : a dict in the form {key : unit}, only for the columns
                 which have a non-empty unit
    """

    def formatBloc(t):
        """ Format the result to have a list of (key, values) easily usable
        with Dict and transform data into array

        Add two fields :
            names_ : the column names found
            unit_  : a dict in the form {key : unit}
        """
        columns = []
        units = {}

        # zip(*rows) turns the row-by-row data into one tuple per column
        transposedData = zip(*t.data)
        for header, unit, data in zip(t.header, t.unit, transposedData):
            # Every unit is its own (possibly empty) group, so units stay
            # aligned with their column even when some of them are empty.
            if unit:
                units[header] = unit[0]
            columns.append(ParseResults([header, array(data)]))

        columns.append(ParseResults(['names_', t.header]))
        columns.append(ParseResults(['unit_', units]))

        return columns

    def defineColNumber(t):
        """ Define unitLine and tabValueLine to match the same number of
        columns as the header line """
        nbcols = len(t.header)
        # '<<' is used on purpose : '<<=' would rebind the names as locals of
        # this function instead of filling the enclosing Forward objects.
        # Group(unitDef) keeps one (possibly empty) entry per column, whereas
        # a bare unitDef yields no token at all for '()', '(-)' or '(/)'.
        unitLine << Group(Group(unitDef)*nbcols + EOL)
        tabValueLine << Group(tabValueDef*nbcols + EOL)

    tabValueDef = pyValue | CharsNotIn('[ \t\r\n').setWhitespaceChars(" \t")
    firstLine = Group(OneOrMore(keyNameWithoutSpace)+EOL)
    # Both lines depend on the number of columns, which is only known once
    # the header has been parsed (see defineColNumber).
    unitLine = Forward()
    tabValueLine = Forward()

    tableCol = (firstLine('header').setParseAction(defineColNumber)
                + unitLine('unit')
                + Group(OneOrMore(tabValueLine))('data')
                ).setParseAction(formatBloc)

    return Dict(tableCol)
221
222
223
def matrixParser(pattern=pyValue):
    """ Return a pattern matching a matrix whose elements all match pattern.

    The matrix must have at least two lines, and every line must have as many
    elements as the first one. The result is a single numpy array.
    """
    # Pattern of the lines following the first one; it is only known once the
    # first line has been parsed and its number of columns counted.
    matrixLine = Forward()

    def lockColNumber(t):
        """ Make matrixLine match as many columns as the first line has """
        matrixLine << Group(pattern*len(t[0]) + EOL)

    def toArray(t):
        """ Return the parsed lines as an array object """
        return [array(t.asList())]

    firstLine = Group(OneOrMore(pattern) + EOL)
    firstLine.setParseAction(lockColNumber)

    matrixDef = firstLine + OneOrMore(matrixLine)
    matrixDef.setParseAction(toArray)

    return matrixDef
241
242
def tableRowParser():
    """ Define a pattern matching a table described row by row, following the schema :

            Name_1 (unit)   value_11   value_12   ...   value_1n
            Name_2 (unit)   value_21   value_22   ...   value_2n
              ...            ...        ...       ...     ...

    Units are optional.
    A name can contain spaces if it is followed by a unit, otherwise it can't.
    The table must have at least two rows, all with the same number of values.

    Value can be any standard python value (int, number, None, False, True, NaN
    or quoted strings) or a raw string which can't contain spaces or '['.

    Return a Dict element to allow accessing data using the row name as a key.

    This Dict has two special fields :
        names_ : the list of row names found
        unit_  : a dict in the form {key : unit}
    """

    def toKeyValuePairs(t):
        """ Turn the parsed rows into (key, array of values) pairs usable by Dict

        Two pairs are added at the end :
            names_ : the list of row names found
            unit_  : a dict in the form {key : unit}, only for the rows
                     which have a unit
        """
        names = [row.header for row in t]
        pairs = [ParseResults([row.header, array(tuple(row.value))]) for row in t]

        units = {}
        for row in t:
            if row.unit:
                units[row.header] = row.unit[0]

        pairs.append(ParseResults(['names_', names]))
        pairs.append(ParseResults(['unit_', units]))
        return pairs

    def lockColNumber(t):
        """ Make tabValueLine match as many values as the first row has """
        nbcols = len(t[0].value)
        tabValueLine << Group(rowHeader + Group(tabValueDef*nbcols)('value') + EOL)

    tabValueDef = pyValue | CharsNotIn('[ \t\r\n').setWhitespaceChars(" \t")

    # A name with spaces is only accepted when a unit follows it
    nameWithUnit = keyName("header") + unitDef('unit')
    rowHeader = nameWithUnit | keyNameWithoutSpace('header')

    firstLine = Group(rowHeader + Group(OneOrMore(tabValueDef))('value') + EOL)
    firstLine.setParseAction(lockColNumber)

    # Pattern of the following rows, filled in once the first row is parsed
    tabValueLine = Forward()

    tableRowDef = firstLine + OneOrMore(tabValueLine)
    tableRowDef.setParseAction(toKeyValuePairs)

    return Dict(tableRowDef)
302
303
304
def test_pattern(pattern, fname='data.txt'):
    """ A simple function to test a ParserElement

    Read the whole file fname, scan its content with pattern and print the
    tokens of every match found.

    pattern : object providing a scanString(text) method
    fname   : path of the text file to scan
    """
    # Close the file as soon as it is read instead of leaking the handle
    with open(fname) as source:
        text = source.read()

    for tokens, _start, _end in pattern.scanString(text):
        # Same output as the former statement "print 'found : ', tokens",
        # written so that it also runs under Python 3.
        print('found :  %s' % (tokens,))
309
310
311
def parseConfigFile(fname):
    """ Parse a file structured in sections according to the schema
            [ section Name ]
            <bloc>
            ....

    where <bloc> can be any text matching one of the elements created by :

            * paramParser    <=> a set of variable definitions
            * tableColParser <=> a table defined column by column
            * tableRowParser <=> a table defined row by row
            * matrixParser   <=> a matrix containing only python values or NaN

    A section can also be empty (a section name followed by no bloc); it is
    then skipped.

    Any text after the character # is considered as a comment and is ignored

    Return a Dict element to allow accessing a bloc using the section name as a key.

    Raise ParseException if the file can't be parsed entirely; the message of
    the exception is extended to show the faulty line and column.
    """
    sectionName = Suppress('[') + keyName + Suppress(']')

    # The bloc parsers are tried in this order for every section
    bloc = (paramParser()
            | tableColParser()
            | tableRowParser()
            | matrixParser())
    section = Group(sectionName + bloc)

    # A section name which is not followed by a known bloc is dropped
    parser = Dict(OneOrMore(section | Suppress(sectionName)))
    parser.ignore('#' + restOfLine)

    try:
        return parser.parseFile(fname, parseAll=True)
    except ParseException as pe:
        # Prepend the location of the error and point at the faulty column,
        # then re-raise the same exception with the enriched message.
        separator = '-'*40
        pe.msg = '\n'.join([
            "ERROR during parsing of %s,  line %d:" % (fname, pe.lineno),
            separator,
            pe.line,
            ' '*(pe.col-1) + '^',
            separator,
            pe.msg,
        ])
        raise
360
if __name__ == '__main__':

    from sys import argv
    from pprint import pprint

    # The file to parse is the first command line argument, 'data.txt' by default
    fname = argv[1] if len(argv) >= 2 else 'data.txt'

    data = parseConfigFile(fname)
    pprint(data.asList())
371
372