Chris@87
|
1 """A collection of functions designed to help I/O with ascii files.
|
Chris@87
|
2
|
Chris@87
|
3 """
|
Chris@87
|
4 from __future__ import division, absolute_import, print_function
|
Chris@87
|
5
|
Chris@87
|
6 __docformat__ = "restructuredtext en"
|
Chris@87
|
7
|
Chris@87
|
8 import sys
|
Chris@87
|
9 import numpy as np
|
Chris@87
|
10 import numpy.core.numeric as nx
|
Chris@87
|
11 from numpy.compat import asbytes, bytes, asbytes_nested, basestring
|
Chris@87
|
12
|
Chris@87
|
13 if sys.version_info[0] >= 3:
|
Chris@87
|
14 from builtins import bool, int, float, complex, object, str
|
Chris@87
|
15 unicode = str
|
Chris@87
|
16 else:
|
Chris@87
|
17 from __builtin__ import bool, int, float, complex, object, unicode, str
|
Chris@87
|
18
|
Chris@87
|
19
|
Chris@87
|
if sys.version_info[0] < 3:
    # Python 2: byte strings are the native ``str`` type, so the builtin
    # constructors can consume them directly.
    _bytes_to_complex = complex
    _bytes_to_name = str
else:
    def _bytes_to_complex(s):
        """Decode ASCII bytes `s` and parse the text as a complex number."""
        text = s.decode('ascii')
        return complex(text)

    def _bytes_to_name(s):
        """Decode ASCII bytes `s` into a text string."""
        text = s.decode('ascii')
        return text
|
Chris@87
|
29
|
Chris@87
|
def _is_string_like(obj):
    """
    Tell whether `obj` can be concatenated with a native string.

    The check is behavioural: ``obj + ''`` is attempted and a `TypeError`
    or `ValueError` means the object is not string-like.
    """
    try:
        obj + ''
    except (TypeError, ValueError):
        outcome = False
    else:
        outcome = True
    return outcome
|
Chris@87
|
39
|
Chris@87
|
def _is_bytes_like(obj):
    """
    Tell whether `obj` can be concatenated with a bytes object.

    The check is behavioural: adding an empty bytes literal to `obj` is
    attempted and a `TypeError` or `ValueError` means the object is not
    bytes-like.
    """
    try:
        obj + b''
    except (TypeError, ValueError):
        outcome = False
    else:
        outcome = True
    return outcome
|
Chris@87
|
49
|
Chris@87
|
50
|
Chris@87
|
def _to_filehandle(fname, flag='r', return_opened=False):
    """
    Return the file handle corresponding to a file name or an open file.

    A name ending in '.gz' is opened with `gzip.open`, a name ending in
    '.bz2' with `bz2.BZ2File`, and any other name with the builtin `open`.
    An object that has a ``seek`` attribute is assumed to be an open file
    and is returned unchanged.

    Parameters
    ----------
    fname : string, filehandle
        Name of the file whose filehandle must be returned, or an already
        open file-like object.
    flag : string, optional
        Mode passed to the opener ('r' for read, 'w' for write).
    return_opened : boolean, optional
        Whether to also return the opening status of the file.

    Returns
    -------
    fhd : filehandle
        The opened (or passed-through) file object.
    opened : bool
        Only returned when `return_opened` is true. True if the file was
        opened by this function, False if `fname` was already a handle.

    Raises
    ------
    ValueError
        If `fname` is neither string-like nor an object with ``seek``.
    """
    if _is_string_like(fname):
        if fname.endswith('.gz'):
            import gzip
            fhd = gzip.open(fname, flag)
        elif fname.endswith('.bz2'):
            import bz2
            # Forward the requested mode; it used to be dropped, which
            # forced every '.bz2' file to be opened for reading.
            fhd = bz2.BZ2File(fname, flag)
        else:
            # `file` only exists on Python 2; `open` works on 2 and 3.
            fhd = open(fname, flag)
        opened = True
    elif hasattr(fname, 'seek'):
        fhd = fname
        opened = False
    else:
        raise ValueError('fname must be a string or file handle')
    if return_opened:
        return fhd, opened
    return fhd
|
Chris@87
|
83
|
Chris@87
|
84
|
Chris@87
|
def has_nested_fields(ndtype):
    """
    Report whether at least one field of a dtype is itself structured.

    Parameters
    ----------
    ndtype : dtype
        Data-type of a structured array.

    Returns
    -------
    nested : bool
        True if any field of `ndtype` has named sub-fields, False
        otherwise (including when `ndtype` has no fields at all).

    Raises
    ------
    AttributeError
        If `ndtype` does not have a `names` attribute.

    Examples
    --------
    >>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float)])
    >>> np.lib._iotools.has_nested_fields(dt)
    False

    """
    field_names = ndtype.names or ()
    return any(ndtype[field].names for field in field_names)
|
Chris@87
|
110
|
Chris@87
|
111
|
Chris@87
|
def flatten_dtype(ndtype, flatten_base=False):
    """
    Collapse a (possibly nested) structured dtype into a flat list of
    base dtypes.

    Field names are not preserved in the result.

    Parameters
    ----------
    ndtype : dtype
        The datatype to collapse.
    flatten_base : {False, True}, optional
        If True, a field with a shape contributes one entry per element
        of that shape; if False it contributes a single entry.

    Returns
    -------
    types : list of dtype
        Base dtypes in field order, nested fields expanded depth-first.

    Examples
    --------
    >>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
    ...                ('block', np.int32, (2, 3))])
    >>> np.lib._iotools.flatten_dtype(dt)
    [dtype('S4'), dtype('float64'), dtype('float64'), dtype('int32')]
    >>> len(np.lib._iotools.flatten_dtype(dt, flatten_base=True))
    9

    """
    field_names = ndtype.names
    if field_names is not None:
        # Structured dtype: recurse into every field, in declaration order.
        flattened = []
        for field_name in field_names:
            sub_dtype = ndtype.fields[field_name][0]
            flattened += flatten_dtype(sub_dtype, flatten_base)
        return flattened
    # Plain (possibly sub-array) dtype: repeat the base once per element
    # only when the caller asked for the shape to be flattened.
    repeat = int(np.prod(ndtype.shape)) if flatten_base else 1
    return [ndtype.base] * repeat
|
Chris@87
|
150
|
Chris@87
|
151
|
Chris@87
|
class LineSplitter(object):
    """
    Callable that breaks a line into fields, either at a delimiter or at
    fixed positions.

    Parameters
    ----------
    delimiter : str, int, or sequence of ints, optional
        If a string, character used to delimit consecutive fields.
        If an integer or a sequence of integers, width(s) of each field.
        If None or empty, the line is split with no explicit delimiter.
    comments : str, optional
        Character used to mark the beginning of a comment. Default is '#'.
        Everything from the first occurrence onwards is discarded. Use
        None to disable comment removal.
    autostrip : bool, optional
        Whether to strip each individual field. Default is True.

    """

    def autostrip(self, method):
        """
        Wrap `method` so that every item it returns is stripped.

        Parameters
        ----------
        method : function
            Function that takes a single argument and returns a sequence
            of strings.

        Returns
        -------
        wrapped : function
            Function taking a single argument and returning the list of
            items produced by `method`, each with ``strip()`` applied.

        """
        def wrapped(input):
            return [piece.strip() for piece in method(input)]
        return wrapped

    def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True):
        self.comments = comments
        # Text delimiters are compared against bytes lines: encode them.
        if isinstance(delimiter, unicode):
            delimiter = delimiter.encode('ascii')
        if delimiter is None or _is_bytes_like(delimiter):
            # Character delimiter (an empty one is treated like None).
            splitter = self._delimited_splitter
            delimiter = delimiter or None
        elif hasattr(delimiter, '__iter__'):
            # Sequence of widths: precompute one slice per field.
            splitter = self._variablewidth_splitter
            bounds = np.cumsum([0] + list(delimiter))
            delimiter = [slice(start, stop)
                         for start, stop in zip(bounds[:-1], bounds[1:])]
        elif int(delimiter):
            # Single non-zero width shared by every field.
            splitter = self._fixedwidth_splitter
            delimiter = int(delimiter)
        else:
            splitter = self._delimited_splitter
            delimiter = None
        self.delimiter = delimiter
        self._handyman = self.autostrip(splitter) if autostrip else splitter

    def _delimited_splitter(self, line):
        """Drop the comment part, trim the line and split on the delimiter."""
        marker = self.comments
        if marker is not None:
            line = line.split(marker)[0]
        line = line.strip(asbytes(" \r\n"))
        return line.split(self.delimiter) if line else []

    def _fixedwidth_splitter(self, line):
        """Drop the comment part and cut the line into equal-width chunks."""
        marker = self.comments
        if marker is not None:
            line = line.split(marker)[0]
        line = line.strip(asbytes("\r\n"))
        if not line:
            return []
        width = self.delimiter
        return [line[start:start + width]
                for start in range(0, len(line), width)]

    def _variablewidth_splitter(self, line):
        """Drop the comment part and cut the line at the stored slices."""
        marker = self.comments
        if marker is not None:
            line = line.split(marker)[0]
        if not line:
            return []
        return [line[window] for window in self.delimiter]

    def __call__(self, line):
        """Split `line` with the strategy selected at construction time."""
        return self._handyman(line)
|
Chris@87
|
246
|
Chris@87
|
247
|
Chris@87
|
class NameValidator(object):
    """
    Object to validate a list of strings to use as field names.

    Spaces are replaced by `replace_space`, the characters listed in
    `deletechars` are removed, and names found in the exclusion list get
    a trailing '_'. Names that end up empty are replaced by a default
    name, and duplicates get a numeric suffix.

    Once an instance has been created, it can be called with a list of
    names, and a tuple of valid names will be returned. The `__call__`
    method accepts an optional keyword `defaultfmt` that sets the format
    of the default names. By default this is 'f%i', so that names will
    default to `f0`, `f1`, etc.

    Parameters
    ----------
    excludelist : sequence, optional
        A list of names to exclude. The default names
        ['return', 'file', 'print'] are always excluded in addition.
        Excluded names are appended an underscore: for example, `file`
        becomes `file_` if supplied. The sequence is copied, not modified.
    deletechars : str, optional
        A string combining invalid characters that must be deleted from
        the names. The double quote is always deleted as well.
    case_sensitive : {True, False, 'upper', 'lower'}, optional
        * If True or None, field names are left unchanged.
        * If False or a string containing 'u', field names are converted
          to upper case.
        * Otherwise, if a string containing 'l', field names are
          converted to lower case.
        * Any other string leaves the names unchanged.

        The default value is None (same as True).
    replace_space : '_', optional
        Character(s) used in replacement of white spaces.

    Notes
    -----
    Calling an instance of `NameValidator` is the same as calling its
    method `validate`.

    Examples
    --------
    >>> validator = np.lib._iotools.NameValidator()
    >>> validator(['file', 'field2', 'with space', 'CaSe'])
    ('file_', 'field2', 'with_space', 'CaSe')

    >>> validator = np.lib._iotools.NameValidator(excludelist=['excl'],
    ...                                           deletechars='q',
    ...                                           case_sensitive='lower')
    >>> validator(['excl', 'field2', 'no_q', 'with space', 'CaSe'])
    ('excl_', 'field2', 'no_', 'with_space', 'case')

    """

    defaultexcludelist = ['return', 'file', 'print']
    # Raw string: the backslash is a character to delete, not an escape.
    defaultdeletechars = set(r"""~!@#$%^&*()-=+~\|]}[{';: /?.>,<""")

    def __init__(self, excludelist=None, deletechars=None,
                 case_sensitive=None, replace_space='_'):
        # Work on a copy so the caller's list is never extended in place.
        excluded = [] if excludelist is None else list(excludelist)
        excluded.extend(self.defaultexcludelist)
        self.excludelist = excluded
        # Copy the class-level default too: adding '"' below must not
        # alter the set shared by every instance.
        if deletechars is None:
            delete = set(self.defaultdeletechars)
        else:
            delete = set(deletechars)
        delete.add('"')
        self.deletechars = delete
        # Select the case conversion applied to every name.
        if (case_sensitive is None) or (case_sensitive is True):
            self.case_converter = lambda x: x
        elif (case_sensitive is False) or ('u' in case_sensitive):
            self.case_converter = lambda x: x.upper()
        elif 'l' in case_sensitive:
            self.case_converter = lambda x: x.lower()
        else:
            self.case_converter = lambda x: x
        self.replace_space = replace_space

    def validate(self, names, defaultfmt="f%i", nbfields=None):
        """
        Validate a list of strings as field names for a structured array.

        Parameters
        ----------
        names : sequence of str
            Strings to be validated. A single string is treated as a
            one-element list.
        defaultfmt : str, optional
            Default format string, used if validating a given string
            reduces its length to zero.
        nbfields : integer, optional
            Final number of validated names, used to expand or shrink the
            initial list of names.

        Returns
        -------
        validatednames : tuple of str, or None
            The validated field names. None is returned when both `names`
            and `nbfields` are None.

        Notes
        -----
        A `NameValidator` instance can be called directly, which is the
        same as calling `validate`. For examples, see `NameValidator`.

        """
        if names is None:
            if nbfields is None:
                return None
            names = []
        if isinstance(names, basestring):
            names = [names, ]
        if nbfields is not None:
            # Pad with empty names or truncate to the requested length.
            nbnames = len(names)
            if nbnames < nbfields:
                names = list(names) + [''] * (nbfields - nbnames)
            elif nbnames > nbfields:
                names = names[:nbfields]
        deletechars = self.deletechars
        excludelist = self.excludelist
        case_converter = self.case_converter
        replace_space = self.replace_space
        validatednames = []
        seen = dict()    # validated name -> number of times produced so far
        nbempty = 0      # counter feeding `defaultfmt` for empty names
        for item in names:
            item = case_converter(item).strip()
            if replace_space:
                item = item.replace(' ', replace_space)
            item = ''.join([c for c in item if c not in deletechars])
            if item == '':
                # Pick the first default name that does not collide with
                # one of the names supplied by the caller.
                item = defaultfmt % nbempty
                while item in names:
                    nbempty += 1
                    item = defaultfmt % nbempty
                nbempty += 1
            elif item in excludelist:
                item += '_'
            cnt = seen.get(item, 0)
            if cnt > 0:
                # Duplicate: disambiguate with its occurrence index.
                validatednames.append(item + '_%d' % cnt)
            else:
                validatednames.append(item)
            seen[item] = cnt + 1
        return tuple(validatednames)

    def __call__(self, names, defaultfmt="f%i", nbfields=None):
        """Validate `names`; identical to calling `validate`."""
        return self.validate(names, defaultfmt=defaultfmt, nbfields=nbfields)
|
Chris@87
|
403
|
Chris@87
|
404
|
Chris@87
|
def str2bool(value):
    """
    Tries to transform a string supposed to represent a boolean to a boolean.

    Both byte strings and text strings are accepted.

    Parameters
    ----------
    value : str or bytes
        The string that is transformed to a boolean.

    Returns
    -------
    boolval : bool
        The boolean representation of `value`.

    Raises
    ------
    ValueError
        If the string is not 'True' or 'False' (case independent)

    Examples
    --------
    >>> np.lib._iotools.str2bool('TRUE')
    True
    >>> np.lib._iotools.str2bool(b'false')
    False

    """
    if isinstance(value, (bytes, bytearray)):
        # latin-1 maps every byte to a code point, so decoding cannot
        # fail; non-ASCII input simply ends up as "Invalid boolean".
        value = value.decode('latin-1')
    value = value.upper()
    if value == 'TRUE':
        return True
    if value == 'FALSE':
        return False
    raise ValueError("Invalid boolean")
|
Chris@87
|
439
|
Chris@87
|
440
|
Chris@87
|
class ConverterError(Exception):
    """
    Raised when a converter for string values runs into an error.

    """
|
Chris@87
|
447
|
Chris@87
|
class ConverterLockError(ConverterError):
    """
    Raised when an upgrade is attempted on a locked converter.

    """
|
Chris@87
|
454
|
Chris@87
|
class ConversionWarning(UserWarning):
    """
    Warning category used when a string converter has a problem.

    Notes
    -----
    In `genfromtxt` a `ConversionWarning` is issued if raising exceptions
    is explicitly suppressed with the "invalid_raise" keyword.

    """
|
Chris@87
|
466
|
Chris@87
|
467
|
Chris@87
|
468 class StringConverter(object):
|
Chris@87
|
469 """
|
Chris@87
|
470 Factory class for function transforming a string into another object
|
Chris@87
|
471 (int, float).
|
Chris@87
|
472
|
Chris@87
|
473 After initialization, an instance can be called to transform a string
|
Chris@87
|
474 into another object. If the string is recognized as representing a
|
Chris@87
|
475 missing value, a default value is returned.
|
Chris@87
|
476
|
Chris@87
|
477 Attributes
|
Chris@87
|
478 ----------
|
Chris@87
|
479 func : function
|
Chris@87
|
480 Function used for the conversion.
|
Chris@87
|
481 default : any
|
Chris@87
|
482 Default value to return when the input corresponds to a missing
|
Chris@87
|
483 value.
|
Chris@87
|
484 type : type
|
Chris@87
|
485 Type of the output.
|
Chris@87
|
486 _status : int
|
Chris@87
|
487 Integer representing the order of the conversion.
|
Chris@87
|
488 _mapper : sequence of tuples
|
Chris@87
|
489 Sequence of tuples (dtype, function, default value) to evaluate in
|
Chris@87
|
490 order.
|
Chris@87
|
491 _locked : bool
|
Chris@87
|
492 Holds `locked` parameter.
|
Chris@87
|
493
|
Chris@87
|
494 Parameters
|
Chris@87
|
495 ----------
|
Chris@87
|
496 dtype_or_func : {None, dtype, function}, optional
|
Chris@87
|
497 If a `dtype`, specifies the input data type, used to define a basic
|
Chris@87
|
498 function and a default value for missing data. For example, when
|
Chris@87
|
499 `dtype` is float, the `func` attribute is set to `float` and the
|
Chris@87
|
500 default value to `np.nan`. If a function, this function is used to
|
Chris@87
|
501 convert a string to another object. In this case, it is recommended
|
Chris@87
|
502 to give an associated default value as input.
|
Chris@87
|
503 default : any, optional
|
Chris@87
|
504 Value to return by default, that is, when the string to be
|
Chris@87
|
505 converted is flagged as missing. If not given, `StringConverter`
|
Chris@87
|
506 tries to supply a reasonable default value.
|
Chris@87
|
507 missing_values : sequence of str, optional
|
Chris@87
|
508 Sequence of strings indicating a missing value.
|
Chris@87
|
509 locked : bool, optional
|
Chris@87
|
510 Whether the StringConverter should be locked to prevent automatic
|
Chris@87
|
511 upgrade or not. Default is False.
|
Chris@87
|
512
|
Chris@87
|
513 """
|
Chris@87
|
514 #
|
Chris@87
|
515 _mapper = [(nx.bool_, str2bool, False),
|
Chris@87
|
516 (nx.integer, int, -1),
|
Chris@87
|
517 (nx.floating, float, nx.nan),
|
Chris@87
|
518 (complex, _bytes_to_complex, nx.nan + 0j),
|
Chris@87
|
519 (nx.string_, bytes, asbytes('???'))]
|
Chris@87
|
520 (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
|
Chris@87
|
521 #
|
Chris@87
|
522
|
Chris@87
|
@classmethod
def _getdtype(cls, val):
    """Return the dtype NumPy assigns to `val` when it is made an array."""
    as_array = np.array(val)
    return as_array.dtype
|
Chris@87
|
527 #
|
Chris@87
|
528
|
Chris@87
|
@classmethod
def _getsubdtype(cls, val):
    """Return the scalar type of the dtype NumPy assigns to `val`."""
    inferred = np.array(val).dtype
    return inferred.type
|
Chris@87
|
533 #
|
Chris@87
|
534 # This is a bit annoying. We want to return the "general" type in most
|
Chris@87
|
535 # cases (ie. "string" rather than "S10"), but we want to return the
|
Chris@87
|
536 # specific type for datetime64 (ie. "datetime64[us]" rather than
|
Chris@87
|
537 # "datetime64").
|
Chris@87
|
538
|
Chris@87
|
@classmethod
def _dtypeortype(cls, dtype):
    """
    Return `dtype` itself for datetime64, and its scalar type otherwise.

    The full dtype is kept for datetime64 so that the unit is preserved
    (e.g. "datetime64[us]"); other dtypes collapse to their general type.
    """
    return dtype if dtype.type == np.datetime64 else dtype.type
|
Chris@87
|
545 #
|
Chris@87
|
546
|
Chris@87
|
@classmethod
def upgrade_mapper(cls, func, default=None):
    """
    Upgrade the mapper of a StringConverter by adding a new function and
    its corresponding default.

    The input function (or sequence of functions) and its associated
    default value (if any) is inserted in penultimate position of the
    mapper. The corresponding type is estimated from the dtype of the
    default value.

    Parameters
    ----------
    func : var
        Function, or sequence of functions. A sequence of
        ``(type, function, default)`` tuples or lists is also accepted
        and inserted as is.
    default : any, optional
        Default value associated with `func`, or sequence of defaults
        when `func` is a sequence of functions. A sequence shorter than
        `func` is padded with None.

    Examples
    --------
    >>> import dateutil.parser
    >>> import datetime
    >>> dateparser = dateutil.parser.parse
    >>> defaultdate = datetime.date(2000, 1, 1)
    >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
    """
    # `func` is a single function
    if hasattr(func, '__call__'):
        cls._mapper.insert(-1, (cls._getsubdtype(default), func, default))
        return
    elif hasattr(func, '__iter__'):
        # Nothing to register for an empty sequence.
        if len(func) == 0:
            return
        # Ready-made mapper entries: insert them untouched.
        if isinstance(func[0], (tuple, list)):
            for entry in func:
                cls._mapper.insert(-1, entry)
            return
        if default is None:
            default = [None] * len(func)
        else:
            default = list(default)
            # Pad with one None per missing default. `append` would add
            # a single nested list and `zip` would then drop functions.
            default.extend([None] * (len(func) - len(default)))
        for (fct, dft) in zip(func, default):
            cls._mapper.insert(-1, (cls._getsubdtype(dft), fct, dft))
|
Chris@87
|
587 #
|
Chris@87
|
588
|
Chris@87
|
def __init__(self, dtype_or_func=None, default=None, missing_values=None,
             locked=False):
    """
    Set up the conversion function, default value and missing markers.

    Parameters
    ----------
    dtype_or_func : {None, dtype, function}, optional
        Dtype used to pick a conversion function from the mapper, or the
        conversion function itself. With None the converter starts as a
        boolean converter.
    default : any, optional
        Value returned for missing entries. When omitted, the mapper's
        default for the matching type is used.
    missing_values : sequence of str, optional
        Strings flagging a missing value; a single string is split on
        commas. The empty string is always treated as missing.
    locked : bool, optional
        Whether automatic upgrade of the converter is forbidden.
    """
    # Text markers are normalised to bytes (relevant on Python 3).
    if isinstance(missing_values, unicode):
        missing_values = asbytes(missing_values)
    elif isinstance(missing_values, (list, tuple)):
        missing_values = asbytes_nested(missing_values)
    self._locked = bool(locked)
    if dtype_or_func is None:
        # Nothing requested: minimal set-up as a boolean converter.
        self.func = str2bool
        self._status = 0
        self.default = default or False
        dtype = np.dtype('bool')
    else:
        self.func = None
        try:
            dtype = np.dtype(dtype_or_func)
        except TypeError:
            # Not a dtype: it has to be a callable, then.
            if not hasattr(dtype_or_func, '__call__'):
                errmsg = ("The input argument `dtype` is neither a"
                          " function nor a dtype (got '%s' instead)")
                raise TypeError(errmsg % type(dtype_or_func))
            self.func = dtype_or_func
            # Without an explicit default, probe the function with '0'
            # to guess one (None if the probe is rejected).
            if default is None:
                try:
                    default = self.func(asbytes('0'))
                except ValueError:
                    default = None
            dtype = self._getdtype(default)
        # Locate the first mapper entry compatible with the dtype.
        for position, (deftype, func, fallback) in enumerate(self._mapper):
            if np.issubdtype(dtype.type, deftype):
                self._status = position
                self.default = fallback if default is None else default
                break
        else:
            # No entry matched: fall back to the first level.
            self._status = 0
            self.default = default
        # A dtype was given: use the function of the last entry visited.
        if self.func is None:
            self.func = func
        # Integer conversion: switch to something more robust than the
        # mapper's plain integer function.
        if self.func == self._mapper[1][1]:
            if issubclass(dtype.type, np.uint64):
                self.func = np.uint64
            elif issubclass(dtype.type, np.int64):
                self.func = np.int64
            else:
                self.func = lambda x: int(float(x))
    # Collect the markers of missing values; '' is always one of them.
    markers = [asbytes('')]
    if missing_values is not None:
        if isinstance(missing_values, bytes):
            missing_values = missing_values.split(asbytes(","))
        markers = list(missing_values) + markers
    self.missing_values = set(markers)
    self._callingfunction = self._strict_call
    self.type = self._dtypeortype(dtype)
    self._checked = False
    self._initial_default = default
|
Chris@87
|
664 #
|
Chris@87
|
665
|
Chris@87
|
def _loose_call(self, value):
    """Convert `value` with `func`; return `default` on ValueError."""
    try:
        result = self.func(value)
    except ValueError:
        result = self.default
    return result
|
Chris@87
|
671 #
|
Chris@87
|
672
|
Chris@87
|
def _strict_call(self, value):
    """
    Convert `value` with `func`, tolerating only declared missing values.

    When `func` raises ValueError and the stripped `value` is one of
    `missing_values`, `default` is returned (and `_checked` is reset if
    `_status` is 0). Any other failing value raises ValueError.
    """
    try:
        converted = self.func(value)
    except ValueError:
        if value.strip() not in self.missing_values:
            raise ValueError("Cannot convert string '%s'" % value)
        if not self._status:
            self._checked = False
        return self.default
    return converted
|
Chris@87
|
682 #
|
Chris@87
|
683
|
Chris@87
|
def __call__(self, value):
    """Convert `value` with the currently selected calling function."""
    convert = self._callingfunction
    return convert(value)
|
Chris@87
|
686 #
|
Chris@87
|
687
|
Chris@87
|
def upgrade(self, value):
    """
    Find the best converter for a given string, and return the result.

    The supplied string `value` is converted by testing different
    converters in order. First the `func` method of the
    `StringConverter` instance is tried, if this fails other available
    converters are tried. The order in which these other converters
    are tried is determined by the `_status` attribute of the instance.

    Parameters
    ----------
    value : str
        The string to convert.

    Returns
    -------
    out : any
        The result of converting `value` with the appropriate converter.

    Raises
    ------
    ConverterLockError
        If the conversion fails and the converter is locked.
    ConverterError
        If `_status` equals the number of entries in the mapper.

    """
    self._checked = True
    try:
        # Hand the converted value back, as the documentation promises.
        return self._strict_call(value)
    except ValueError:
        # Raise an exception if we locked the converter...
        if self._locked:
            errmsg = "Converter is locked and cannot be upgraded"
            raise ConverterLockError(errmsg)
        _statusmax = len(self._mapper)
        # Complains if we try to upgrade by the maximum
        _status = self._status
        if _status == _statusmax:
            errmsg = "Could not find a valid conversion function"
            raise ConverterError(errmsg)
        elif _status < _statusmax - 1:
            _status += 1
        # Switch to the mapper entry of the (possibly unchanged) level.
        (self.type, self.func, default) = self._mapper[_status]
        self._status = _status
        # A default supplied at construction wins over the mapper's.
        if self._initial_default is not None:
            self.default = self._initial_default
        else:
            self.default = default
        # Retry with the upgraded converter and propagate its result.
        return self.upgrade(value)
|
Chris@87
|
732
|
Chris@87
|
def iterupgrade(self, value):
    """
    Upgrade the converter until every item of `value` can be converted.

    `value` may be a single item or an iterable of items. Each item is
    passed to `_strict_call`; on the first ValueError the converter moves
    to the next mapper entry (unless it is locked) and the whole input is
    tried again. Nothing is returned.
    """
    self._checked = True
    candidates = value if hasattr(value, '__iter__') else (value,)
    convert = self._strict_call
    try:
        for candidate in candidates:
            convert(candidate)
    except ValueError:
        # A locked converter must not change its conversion function.
        if self._locked:
            errmsg = "Converter is locked and cannot be upgraded"
            raise ConverterLockError(errmsg)
        top = len(self._mapper)
        level = self._status
        if level == top:
            raise ConverterError(
                "Could not find a valid conversion function"
            )
        if level < top - 1:
            level += 1
        self.type, self.func, mapped_default = self._mapper[level]
        # A default supplied at construction wins over the mapper's.
        initial = self._initial_default
        self.default = mapped_default if initial is None else initial
        self._status = level
        self.iterupgrade(candidates)
|
Chris@87
|
762
|
Chris@87
|
763 def update(self, func, default=None, testing_value=None,
|
Chris@87
|
764 missing_values=asbytes(''), locked=False):
|
Chris@87
|
765 """
|
Chris@87
|
766 Set StringConverter attributes directly.
|
Chris@87
|
767
|
Chris@87
|
768 Parameters
|
Chris@87
|
769 ----------
|
Chris@87
|
770 func : function
|
Chris@87
|
771 Conversion function.
|
Chris@87
|
772 default : any, optional
|
Chris@87
|
773 Value to return by default, that is, when the string to be
|
Chris@87
|
774 converted is flagged as missing. If not given,
|
Chris@87
|
775 `StringConverter` tries to supply a reasonable default value.
|
Chris@87
|
776 testing_value : str, optional
|
Chris@87
|
777 A string representing a standard input value of the converter.
|
Chris@87
|
778 This string is used to help defining a reasonable default
|
Chris@87
|
779 value.
|
Chris@87
|
780 missing_values : sequence of str, optional
|
Chris@87
|
781 Sequence of strings indicating a missing value.
|
Chris@87
|
782 locked : bool, optional
|
Chris@87
|
783 Whether the StringConverter should be locked to prevent
|
Chris@87
|
784 automatic upgrade or not. Default is False.
|
Chris@87
|
785
|
Chris@87
|
786 Notes
|
Chris@87
|
787 -----
|
Chris@87
|
788 `update` takes the same parameters as the constructor of
|
Chris@87
|
789 `StringConverter`, except that `func` does not accept a `dtype`
|
Chris@87
|
790 whereas `dtype_or_func` in the constructor does.
|
Chris@87
|
791
|
Chris@87
|
792 """
|
Chris@87
|
793 self.func = func
|
Chris@87
|
794 self._locked = locked
|
Chris@87
|
795 # Don't reset the default to None if we can avoid it
|
Chris@87
|
796 if default is not None:
|
Chris@87
|
797 self.default = default
|
Chris@87
|
798 self.type = self._dtypeortype(self._getdtype(default))
|
Chris@87
|
799 else:
|
Chris@87
|
800 try:
|
Chris@87
|
801 tester = func(testing_value or asbytes('1'))
|
Chris@87
|
802 except (TypeError, ValueError):
|
Chris@87
|
803 tester = None
|
Chris@87
|
804 self.type = self._dtypeortype(self._getdtype(tester))
|
Chris@87
|
805 # Add the missing values to the existing set
|
Chris@87
|
806 if missing_values is not None:
|
Chris@87
|
807 if _is_bytes_like(missing_values):
|
Chris@87
|
808 self.missing_values.add(missing_values)
|
Chris@87
|
809 elif hasattr(missing_values, '__iter__'):
|
Chris@87
|
810 for val in missing_values:
|
Chris@87
|
811 self.missing_values.add(val)
|
Chris@87
|
812 else:
|
Chris@87
|
813 self.missing_values = []
|
Chris@87
|
814
|
Chris@87
|
815
|
Chris@87
|
def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):
    """
    Build a `np.dtype` from a loose description and optional field names.

    `ndtype` is first handed to `np.dtype`. If that raises a ``TypeError``,
    `ndtype` is used as a sequence of formats and combined with `names`.
    Otherwise the field names of the resulting dtype are set or validated
    as described below.

    Parameters
    ----------
    ndtype : var
        Definition of the dtype. Can be any string or dictionary recognized
        by the `np.dtype` function, or a sequence of types.
    names : str or sequence, optional
        Strings to use as field names for a structured dtype. A single
        string holding a comma-separated list of names is split on ``","``.
    defaultfmt : str, optional
        Format string used to define missing names, such as ``"f%i"``
        (default) or ``"fields_%02i"``.
    validationargs : optional
        Keyword arguments forwarded to the `NameValidator` constructor.

    Returns
    -------
    ndtype : np.dtype
        The resulting dtype. When the dtype is already structured, the
        validated names are assigned to its ``names`` attribute directly.

    Examples
    --------
    >>> np.lib._iotools.easy_dtype(float)
    dtype('float64')
    >>> np.lib._iotools.easy_dtype("i4, f8")
    dtype([('f0', '<i4'), ('f1', '<f8')])
    >>> np.lib._iotools.easy_dtype("i4, f8", defaultfmt="field_%03i")
    dtype([('field_000', '<i4'), ('field_001', '<f8')])

    >>> np.lib._iotools.easy_dtype((int, float, float), names="a,b,c")
    dtype([('a', '<i8'), ('b', '<f8'), ('c', '<f8')])
    >>> np.lib._iotools.easy_dtype(float, names="a,b,c")
    dtype([('a', '<f8'), ('b', '<f8'), ('c', '<f8')])

    """
    try:
        ndtype = np.dtype(ndtype)
    except TypeError:
        # Not a dtype specification by itself: use it as a list of formats
        validator = NameValidator(**validationargs)
        field_count = len(ndtype)
        if names is None:
            names = [''] * len(ndtype)
        elif isinstance(names, basestring):
            names = names.split(",")
        names = validator(names, nbfields=field_count, defaultfmt=defaultfmt)
        return np.dtype(dict(formats=ndtype, names=names))

    field_count = len(ndtype)

    if names is not None:
        # Names were given explicitly
        validator = NameValidator(**validationargs)
        if isinstance(names, basestring):
            names = names.split(",")
        if field_count == 0:
            # Simple dtype: repeat its type once per requested name
            formats = tuple([ndtype.type] * len(names))
            names = validator(names, defaultfmt=defaultfmt)
            return np.dtype(list(zip(names, formats)))
        # Structured dtype: only the names need to be validated
        ndtype.names = validator(names, nbfields=field_count,
                                 defaultfmt=defaultfmt)
        return ndtype

    if field_count > 0:
        # No names given, but the dtype already carries some
        validator = NameValidator(**validationargs)
        numbered = tuple("f%i" % i for i in range(field_count))
        if (ndtype.names == numbered) and (defaultfmt != "f%i"):
            # Only the automatic "f%i" names: regenerate with `defaultfmt`
            ndtype.names = validator([''] * field_count,
                                     defaultfmt=defaultfmt)
        else:
            # Names chosen by the caller: just validate them
            ndtype.names = validator(ndtype.names, defaultfmt=defaultfmt)
    return ndtype
|