DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/npyio.py @ changeset 87:2a2c65a20a8b

Add Python libs and headers
author Chris Cannam
date Wed, 25 Feb 2015 14:05:22 +0000
from __future__ import division, absolute_import, print_function

import sys
import os
import re
import itertools
import warnings
import weakref
from operator import itemgetter

import numpy as np
from . import format
from ._datasource import DataSource
from ._compiled_base import packbits, unpackbits
from ._iotools import (
    LineSplitter, NameValidator, StringConverter, ConverterError,
    ConverterLockError, ConversionWarning, _is_string_like, has_nested_fields,
    flatten_dtype, easy_dtype, _bytes_to_name
    )

from numpy.compat import (
    asbytes, asstr, asbytes_nested, bytes, basestring, unicode
    )

if sys.version_info[0] >= 3:
    import pickle
else:
    import cPickle as pickle
    from future_builtins import map

loads = pickle.loads

__all__ = [
    'savetxt', 'loadtxt', 'genfromtxt', 'ndfromtxt', 'mafromtxt',
    'recfromtxt', 'recfromcsv', 'load', 'loads', 'save', 'savez',
    'savez_compressed', 'packbits', 'unpackbits', 'fromregex', 'DataSource'
    ]

def seek_gzip_factory(f):
    """Use this factory to produce the class so that we can do a lazy
    import on gzip.

    """
    import gzip

    class GzipFile(gzip.GzipFile):

        def seek(self, offset, whence=0):
            # figure out new position (we can only seek forwards)
            if whence == 1:
                offset = self.offset + offset

            if whence not in [0, 1]:
                raise IOError("Illegal argument")

            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        def tell(self):
            return self.offset

    if isinstance(f, str):
        f = GzipFile(f)
    elif isinstance(f, gzip.GzipFile):
        # cast to our GzipFile if it's already a gzip.GzipFile

        try:
            name = f.name
        except AttributeError:
            # Backward compatibility for <= 2.5
            name = f.filename
        mode = f.mode

        f = GzipFile(fileobj=f.fileobj, filename=name)
        f.mode = mode

    return f

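# Illustrative usage sketch (not part of the original module; the file name
# 'example.gz' is assumed): seek_gzip_factory wraps a gzip stream so that
# np.load-style callers can seek in it.
#
#     import gzip
#     with gzip.open('example.gz', 'wb') as out:
#         out.write(b'x' * 4096)
#     g = seek_gzip_factory('example.gz')
#     g.seek(2048)       # forward seek, served by 1024-byte reads
#     g.seek(0)          # backward seek triggers rewind() + re-read
#     assert g.tell() == 0
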
class BagObj(object):
    """
    BagObj(obj)

    Convert attribute look-ups to getitems on the object passed in.

    Parameters
    ----------
    obj : class instance
        Object on which attribute look-up is performed.

    Examples
    --------
    >>> from numpy.lib.npyio import BagObj as BO
    >>> class BagDemo(object):
    ...     def __getitem__(self, key): # An instance of BagObj(BagDemo)
    ...                                 # will call this method when any
    ...                                 # attribute look-up is required
    ...         result = "Doesn't matter what you want, "
    ...         return result + "you're gonna get this"
    ...
    >>> demo_obj = BagDemo()
    >>> bagobj = BO(demo_obj)
    >>> bagobj.hello_there
    "Doesn't matter what you want, you're gonna get this"
    >>> bagobj.I_can_be_anything
    "Doesn't matter what you want, you're gonna get this"

    """

    def __init__(self, obj):
        # Use weakref to make NpzFile objects collectable by refcount
        self._obj = weakref.proxy(obj)

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, '_obj')[key]
        except KeyError:
            raise AttributeError(key)

def zipfile_factory(*args, **kwargs):
    import zipfile
    kwargs['allowZip64'] = True
    return zipfile.ZipFile(*args, **kwargs)

class NpzFile(object):
    """
    NpzFile(fid)

    A dictionary-like object with lazy-loading of files in the zipped
    archive provided on construction.

    `NpzFile` is used to load files in the NumPy ``.npz`` data archive
    format. It assumes that files in the archive have a ``.npy`` extension;
    other files are ignored.

    The arrays and file strings are lazily loaded on either
    getitem access using ``obj['key']`` or attribute lookup using
    ``obj.f.key``. A list of all files (without ``.npy`` extensions) can
    be obtained with ``obj.files`` and the ZipFile object itself using
    ``obj.zip``.

    Attributes
    ----------
    files : list of str
        List of all files in the archive with a ``.npy`` extension.
    zip : ZipFile instance
        The ZipFile object initialized with the zipped archive.
    f : BagObj instance
        An object on which attribute look-up can be performed as an
        alternative to getitem access on the `NpzFile` instance itself.

    Parameters
    ----------
    fid : file or str
        The zipped archive to open. This is either a file-like object
        or a string containing the path to the archive.
    own_fid : bool, optional
        Whether NpzFile should close the file handle.
        Requires that `fid` is a file-like object.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()
    >>> x = np.arange(10)
    >>> y = np.sin(x)
    >>> np.savez(outfile, x=x, y=y)
    >>> outfile.seek(0)

    >>> npz = np.load(outfile)
    >>> isinstance(npz, np.lib.npyio.NpzFile)
    True
    >>> npz.files
    ['y', 'x']
    >>> npz['x']  # getitem access
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> npz.f.x  # attribute lookup
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    """

    def __init__(self, fid, own_fid=False):
        # Import is postponed to here since zipfile depends on gzip, an
        # optional component of the so-called standard library.
        _zip = zipfile_factory(fid)
        self._files = _zip.namelist()
        self.files = []
        for x in self._files:
            if x.endswith('.npy'):
                self.files.append(x[:-4])
            else:
                self.files.append(x)
        self.zip = _zip
        self.f = BagObj(self)
        if own_fid:
            self.fid = fid
        else:
            self.fid = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """
        Close the file.

        """
        if self.zip is not None:
            self.zip.close()
            self.zip = None
        if self.fid is not None:
            self.fid.close()
            self.fid = None
        self.f = None  # break reference cycle

    def __del__(self):
        self.close()

    def __getitem__(self, key):
        # FIXME: This seems like it will copy strings around
        #   more than is strictly necessary.  The zipfile
        #   will read the string and then
        #   the format.read_array will copy the string
        #   to another place in memory.
        #   It would be better if the zipfile could read
        #   (or at least uncompress) the data
        #   directly into the array memory.
        member = 0
        if key in self._files:
            member = 1
        elif key in self.files:
            member = 1
            key += '.npy'
        if member:
            bytes = self.zip.open(key)
            magic = bytes.read(len(format.MAGIC_PREFIX))
            bytes.close()
            if magic == format.MAGIC_PREFIX:
                bytes = self.zip.open(key)
                return format.read_array(bytes)
            else:
                return self.zip.read(key)
        else:
            raise KeyError("%s is not a file in the archive" % key)

    def __iter__(self):
        return iter(self.files)

    def items(self):
        """
        Return a list of tuples, with each tuple (filename, array in file).

        """
        return [(f, self[f]) for f in self.files]

    def iteritems(self):
        """Generator that returns tuples (filename, array in file)."""
        for f in self.files:
            yield (f, self[f])

    def keys(self):
        """Return files in the archive with a ``.npy`` extension."""
        return self.files

    def iterkeys(self):
        """Return an iterator over the files in the archive."""
        return self.__iter__()

    def __contains__(self, key):
        return self.files.__contains__(key)

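# A minimal sketch of the context-manager protocol defined above (the file
# name 'archive.npz' is assumed): __enter__/__exit__ guarantee the zip and
# file handles are released even if an exception occurs mid-read.
#
#     np.savez('archive.npz', a=np.arange(3))
#     with np.load('archive.npz') as npz:
#         a = npz['a']            # decompressed lazily, on access
#     # npz.zip and npz.fid are closed here
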
def load(file, mmap_mode=None):
    """
    Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.

    Parameters
    ----------
    file : file-like object or string
        The file to read. File-like objects must support the
        ``seek()`` and ``read()`` methods. Pickled files require that the
        file-like object support the ``readline()`` method as well.
    mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
        If not None, then memory-map the file, using the given mode (see
        `numpy.memmap` for a detailed description of the modes). A
        memory-mapped array is kept on disk. However, it can be accessed
        and sliced like any ndarray. Memory mapping is especially useful
        for accessing small fragments of large files without reading the
        entire file into memory.

    Returns
    -------
    result : array, tuple, dict, etc.
        Data stored in the file. For ``.npz`` files, the returned instance
        of NpzFile class must be closed to avoid leaking file descriptors.

    Raises
    ------
    IOError
        If the input file does not exist or cannot be read.

    See Also
    --------
    save, savez, savez_compressed, loadtxt
    memmap : Create a memory-map to an array stored in a file on disk.

    Notes
    -----
    - If the file contains pickle data, then whatever object is stored
      in the pickle is returned.
    - If the file is a ``.npy`` file, then a single array is returned.
    - If the file is a ``.npz`` file, then a dictionary-like object is
      returned, containing ``{filename: array}`` key-value pairs, one for
      each file in the archive.
    - If the file is a ``.npz`` file, the returned value supports the
      context manager protocol in a similar fashion to the open function::

        with load('foo.npz') as data:
            a = data['a']

      The underlying file descriptor is closed when exiting the 'with'
      block.

    Examples
    --------
    Store data to disk, and load it again:

    >>> np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]]))
    >>> np.load('/tmp/123.npy')
    array([[1, 2, 3],
           [4, 5, 6]])

    Store compressed data to disk, and load it again:

    >>> a = np.array([[1, 2, 3], [4, 5, 6]])
    >>> b = np.array([1, 2])
    >>> np.savez('/tmp/123.npz', a=a, b=b)
    >>> data = np.load('/tmp/123.npz')
    >>> data['a']
    array([[1, 2, 3],
           [4, 5, 6]])
    >>> data['b']
    array([1, 2])
    >>> data.close()

    Mem-map the stored array, and then access the second row
    directly from disk:

    >>> X = np.load('/tmp/123.npy', mmap_mode='r')
    >>> X[1, :]
    memmap([4, 5, 6])

    """
    import gzip

    own_fid = False
    if isinstance(file, basestring):
        fid = open(file, "rb")
        own_fid = True
    elif isinstance(file, gzip.GzipFile):
        fid = seek_gzip_factory(file)
    else:
        fid = file

    try:
        # Code to distinguish between NumPy binary files and pickles.
        _ZIP_PREFIX = asbytes('PK\x03\x04')
        N = len(format.MAGIC_PREFIX)
        magic = fid.read(N)
        fid.seek(-N, 1)  # back-up
        if magic.startswith(_ZIP_PREFIX):
            # zip-file (assume .npz)
            # Transfer file ownership to NpzFile
            tmp = own_fid
            own_fid = False
            return NpzFile(fid, own_fid=tmp)
        elif magic == format.MAGIC_PREFIX:
            # .npy file
            if mmap_mode:
                return format.open_memmap(file, mode=mmap_mode)
            else:
                return format.read_array(fid)
        else:
            # Try a pickle
            try:
                return pickle.load(fid)
            except:
                raise IOError(
                    "Failed to interpret file %s as a pickle" % repr(file))
    finally:
        if own_fid:
            fid.close()

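# Sketch of the dispatch implemented in the try-block above: the leading
# bytes of the stream select the branch.
#
#     b'PK\x03\x04'  (zip header)          -> NpzFile
#     b'\x93NUMPY'   (format.MAGIC_PREFIX) -> format.read_array/open_memmap
#     anything else                        -> pickle.load
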
def save(file, arr):
    """
    Save an array to a binary file in NumPy ``.npy`` format.

    Parameters
    ----------
    file : file or str
        File or filename to which the data is saved. If file is a file-object,
        then the filename is unchanged. If file is a string, a ``.npy``
        extension will be appended to the file name if it does not already
        have one.
    arr : array_like
        Array data to be saved.

    See Also
    --------
    savez : Save several arrays into a ``.npz`` archive
    savetxt, load

    Notes
    -----
    For a description of the ``.npy`` format, see `format`.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()

    >>> x = np.arange(10)
    >>> np.save(outfile, x)

    >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
    >>> np.load(outfile)
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    """
    own_fid = False
    if isinstance(file, basestring):
        if not file.endswith('.npy'):
            file = file + '.npy'
        fid = open(file, "wb")
        own_fid = True
    else:
        fid = file

    try:
        arr = np.asanyarray(arr)
        format.write_array(fid, arr)
    finally:
        if own_fid:
            fid.close()

def savez(file, *args, **kwds):
    """
    Save several arrays into a single file in uncompressed ``.npz`` format.

    If arguments are passed in with no keywords, the corresponding variable
    names, in the ``.npz`` file, are 'arr_0', 'arr_1', etc. If keyword
    arguments are given, the corresponding variable names, in the ``.npz``
    file will match the keyword names.

    Parameters
    ----------
    file : str or file
        Either the file name (string) or an open file (file-like object)
        where the data will be saved. If file is a string, the ``.npz``
        extension will be appended to the file name if it is not already there.
    args : Arguments, optional
        Arrays to save to the file. Since it is not possible for Python to
        know the names of the arrays outside `savez`, the arrays will be saved
        with names "arr_0", "arr_1", and so on. These arguments can be any
        expression.
    kwds : Keyword arguments, optional
        Arrays to save to the file. Arrays will be saved in the file with the
        keyword names.

    Returns
    -------
    None

    See Also
    --------
    save : Save a single array to a binary file in NumPy format.
    savetxt : Save an array to a file as plain text.
    savez_compressed : Save several arrays into a compressed ``.npz`` archive

    Notes
    -----
    The ``.npz`` file format is a zipped archive of files named after the
    variables they contain. The archive is not compressed and each file
    in the archive contains one variable in ``.npy`` format. For a
    description of the ``.npy`` format, see `format`.

    When opening the saved ``.npz`` file with `load` a `NpzFile` object is
    returned. This is a dictionary-like object which can be queried for
    its list of arrays (with the ``.files`` attribute), and for the arrays
    themselves.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()
    >>> x = np.arange(10)
    >>> y = np.sin(x)

    Using `savez` with \\*args, the arrays are saved with default names.

    >>> np.savez(outfile, x, y)
    >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
    >>> npzfile = np.load(outfile)
    >>> npzfile.files
    ['arr_1', 'arr_0']
    >>> npzfile['arr_0']
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    Using `savez` with \\**kwds, the arrays are saved with the keyword names.

    >>> outfile = TemporaryFile()
    >>> np.savez(outfile, x=x, y=y)
    >>> outfile.seek(0)
    >>> npzfile = np.load(outfile)
    >>> npzfile.files
    ['y', 'x']
    >>> npzfile['x']
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    """
    _savez(file, args, kwds, False)

def savez_compressed(file, *args, **kwds):
    """
    Save several arrays into a single file in compressed ``.npz`` format.

    If keyword arguments are given, then filenames are taken from the keywords.
    If arguments are passed in with no keywords, then stored file names are
    arr_0, arr_1, etc.

    Parameters
    ----------
    file : str
        File name of ``.npz`` file.
    args : Arguments
        Function arguments.
    kwds : Keyword arguments
        Keywords.

    See Also
    --------
    numpy.savez : Save several arrays into an uncompressed ``.npz`` file format
    numpy.load : Load the files created by savez_compressed.

    """
    _savez(file, args, kwds, True)

def _savez(file, args, kwds, compress):
    # Import is postponed to here since zipfile depends on gzip, an optional
    # component of the so-called standard library.
    import zipfile
    # Import deferred for startup time improvement
    import tempfile

    if isinstance(file, basestring):
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    zipf = zipfile_factory(file, mode="w", compression=compression)

    # Stage arrays in a temporary file on disk, before writing to zip.
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)
    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                format.write_array(fid, np.asanyarray(val))
                fid.close()
                fid = None
                zipf.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()
    finally:
        os.remove(tmpfile)

    zipf.close()

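# Illustrative sketch of the name handling in _savez: positional arrays are
# assigned 'arr_%d' names, so an explicit 'arr_0' keyword collides (the file
# name 'demo.npz' is assumed).
#
#     np.savez('demo.npz', np.arange(3))                     # stored as 'arr_0'
#     np.savez('demo.npz', np.arange(3), arr_0=np.zeros(3))  # ValueError
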
def _getconv(dtype):
    """ Find the correct dtype converter. Adapted from matplotlib """
    typ = dtype.type
    if issubclass(typ, np.bool_):
        return lambda x: bool(int(x))
    if issubclass(typ, np.uint64):
        return np.uint64
    if issubclass(typ, np.int64):
        return np.int64
    if issubclass(typ, np.integer):
        return lambda x: int(float(x))
    elif issubclass(typ, np.floating):
        return float
    elif issubclass(typ, np.complex):
        return complex
    elif issubclass(typ, np.bytes_):
        return bytes
    else:
        return str

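# Quick illustration of the converter selection above (return values shown
# as comments):
#
#     _getconv(np.dtype(bool))('1')         # True, via bool(int(x))
#     _getconv(np.dtype('i4'))('2.0')       # 2, via int(float(x))
#     _getconv(np.dtype(float))('3.5')      # 3.5
#     _getconv(np.dtype(complex))('1+2j')   # (1+2j)
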
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0):
    """
    Load data from a text file.

    Each row in the text file must have the same number of values.

    Parameters
    ----------
    fname : file or str
        File, filename, or generator to read. If the filename extension is
        ``.gz`` or ``.bz2``, the file is first decompressed. Note that
        generators should return byte strings for Python 3k.
    dtype : data-type, optional
        Data-type of the resulting array; default: float. If this is a
        record data-type, the resulting array will be 1-dimensional, and
        each row will be interpreted as an element of the array. In this
        case, the number of columns used must match the number of fields in
        the data-type.
    comments : str, optional
        The character used to indicate the start of a comment;
        default: '#'.
    delimiter : str, optional
        The string used to separate values. By default, this is any
        whitespace.
    converters : dict, optional
        A dictionary mapping column number to a function that will convert
        that column to a float. E.g., if column 0 is a date string:
        ``converters = {0: datestr2num}``. Converters can also be used to
        provide a default value for missing data (but see also `genfromtxt`):
        ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None.
    skiprows : int, optional
        Skip the first `skiprows` lines; default: 0.
    usecols : sequence, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
        The default, None, results in all columns being read.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``. When used with a record
        data-type, arrays are returned for each field. Default is False.
    ndmin : int, optional
        The returned array will have at least `ndmin` dimensions.
        Otherwise mono-dimensional axes will be squeezed.
        Legal values: 0 (default), 1 or 2.

        .. versionadded:: 1.6.0

    Returns
    -------
    out : ndarray
        Data read from the text file.

    See Also
    --------
    load, fromstring, fromregex
    genfromtxt : Load data with missing values handled as specified.
    scipy.io.loadmat : reads MATLAB data files

    Notes
    -----
    This function aims to be a fast reader for simply formatted files. The
    `genfromtxt` function provides more sophisticated handling of, e.g.,
    lines with missing values.

    Examples
    --------
    >>> from StringIO import StringIO   # StringIO behaves like a file object
    >>> c = StringIO("0 1\\n2 3")
    >>> np.loadtxt(c)
    array([[ 0.,  1.],
           [ 2.,  3.]])

    >>> d = StringIO("M 21 72\\nF 35 58")
    >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
    ...                      'formats': ('S1', 'i4', 'f4')})
    array([('M', 21, 72.0), ('F', 35, 58.0)],
          dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')])

    >>> c = StringIO("1,0,2\\n3,0,4")
    >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
    >>> x
    array([ 1.,  3.])
    >>> y
    array([ 2.,  4.])

    """
    # Type conversions for Py3 convenience
    comments = asbytes(comments)
    user_converters = converters
    if delimiter is not None:
        delimiter = asbytes(delimiter)
    if usecols is not None:
        usecols = list(usecols)

    fown = False
    try:
        if _is_string_like(fname):
            fown = True
            if fname.endswith('.gz'):
                fh = iter(seek_gzip_factory(fname))
            elif fname.endswith('.bz2'):
                import bz2
                fh = iter(bz2.BZ2File(fname))
            elif sys.version_info[0] == 2:
                fh = iter(open(fname, 'U'))
            else:
                fh = iter(open(fname))
        else:
            fh = iter(fname)
    except TypeError:
        raise ValueError('fname must be a string, file handle, or generator')
    X = []

    def flatten_dtype(dt):
        """Unpack a structured data-type, and produce re-packing info."""
        if dt.names is None:
            # If the dtype is flattened, return.
            # If the dtype has a shape, the dtype occurs
            # in the list more than once.
            shape = dt.shape
            if len(shape) == 0:
                return ([dt.base], None)
            else:
                packing = [(shape[-1], list)]
                if len(shape) > 1:
                    for dim in dt.shape[-2::-1]:
                        packing = [(dim*packing[0][0], packing*dim)]
                return ([dt.base] * int(np.prod(dt.shape)), packing)
        else:
            types = []
            packing = []
            for field in dt.names:
                tp, bytes = dt.fields[field]
                flat_dt, flat_packing = flatten_dtype(tp)
                types.extend(flat_dt)
                # Avoid extra nesting for subarrays
                if len(tp.shape) > 0:
                    packing.extend(flat_packing)
                else:
                    packing.append((len(flat_dt), flat_packing))
            return (types, packing)

    def pack_items(items, packing):
        """Pack items into nested lists based on re-packing info."""
        if packing is None:
            return items[0]
        elif packing is tuple:
            return tuple(items)
        elif packing is list:
            return list(items)
        else:
            start = 0
            ret = []
            for length, subpacking in packing:
                ret.append(pack_items(items[start:start+length], subpacking))
                start += length
            return tuple(ret)

    def split_line(line):
        """Chop off comments, strip, and split at delimiter."""
        line = asbytes(line).split(comments)[0].strip(asbytes('\r\n'))
        if line:
            return line.split(delimiter)
        else:
            return []

    try:
        # Make sure we're dealing with a proper dtype
        dtype = np.dtype(dtype)
        defconv = _getconv(dtype)

        # Skip the first `skiprows` lines
        for i in range(skiprows):
            next(fh)

        # Read until we find a line with some values, and use
        # it to estimate the number of columns, N.
        first_vals = None
        try:
            while not first_vals:
                first_line = next(fh)
                first_vals = split_line(first_line)
        except StopIteration:
            # End of lines reached
            first_line = ''
            first_vals = []
            warnings.warn('loadtxt: Empty input file: "%s"' % fname)
        N = len(usecols or first_vals)

        dtype_types, packing = flatten_dtype(dtype)
        if len(dtype_types) > 1:
            # We're dealing with a structured array, each field of
            # the dtype matches a column
            converters = [_getconv(dt) for dt in dtype_types]
        else:
            # All fields have the same dtype
            converters = [defconv for i in range(N)]
            if N > 1:
                packing = [(N, tuple)]

        # By preference, use the converters specified by the user
        for i, conv in (user_converters or {}).items():
            if usecols:
                try:
                    i = usecols.index(i)
                except ValueError:
                    # Unused converter specified
                    continue
            converters[i] = conv

        # Parse each line, including the first
        for i, line in enumerate(itertools.chain([first_line], fh)):
            vals = split_line(line)
            if len(vals) == 0:
                continue
            if usecols:
                vals = [vals[i] for i in usecols]
            if len(vals) != N:
                line_num = i + skiprows + 1
                raise ValueError("Wrong number of columns at line %d"
                                 % line_num)

            # Convert each value according to its column and store
            items = [conv(val) for (conv, val) in zip(converters, vals)]
            # Then pack it according to the dtype's nesting
            items = pack_items(items, packing)
            X.append(items)
    finally:
        if fown:
            fh.close()

    X = np.array(X, dtype)
    # Multicolumn data are returned with shape (1, N, M), i.e.
    # (1, 1, M) for a single row - remove the singleton dimension there
    if X.ndim == 3 and X.shape[:2] == (1, 1):
        X.shape = (1, -1)

    # Verify that the array has at least dimensions `ndmin`.
    # Check correctness of the values of `ndmin`
    if ndmin not in [0, 1, 2]:
        raise ValueError('Illegal value of ndmin keyword: %s' % ndmin)
    # Tweak the size and shape of the arrays - remove extraneous dimensions
    if X.ndim > ndmin:
        X = np.squeeze(X)
    # and ensure we have the minimum number of dimensions asked for
    # - has to be in this order for the odd case ndmin=1, X.squeeze().ndim=0
    if X.ndim < ndmin:
        if ndmin == 1:
            X = np.atleast_1d(X)
        elif ndmin == 2:
            X = np.atleast_2d(X).T

    if unpack:
        if len(dtype_types) > 1:
            # For structured arrays, return an array for each field.
            return [X[field] for field in dtype.names]
        else:
            return X.T
    else:
        return X

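# Hedged sketch of the flatten/pack round-trip used inside loadtxt (the
# helpers are local to the function, so this is conceptual only):
#
#     dt = np.dtype([('x', float), ('pos', float, (2,))])
#     # flatten_dtype(dt) -> ([float64, float64, float64],
#     #                       [(1, None), (2, list)])
#     # pack_items([1.0, 2.0, 3.0], [(1, None), (2, list)])
#     #   -> (1.0, [2.0, 3.0]), matching the nested dtype layout
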
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
            footer='', comments='# '):
    """
    Save an array to a text file.

    Parameters
    ----------
    fname : filename or file handle
        If the filename ends in ``.gz``, the file is automatically saved in
        compressed gzip format. `loadtxt` understands gzipped files
        transparently.
    X : array_like
        Data to be saved to a text file.
    fmt : str or sequence of strs, optional
        A single format (%10.5f), a sequence of formats, or a
        multi-format string, e.g. 'Iteration %d -- %10.5f', in which
        case `delimiter` is ignored. For complex `X`, the legal options
        for `fmt` are:
            a) a single specifier, `fmt='%.4e'`, resulting in numbers formatted
               like `' (%s+%sj)' % (fmt, fmt)`
            b) a full string specifying every real and imaginary part, e.g.
               `' %.4e %+.4j %.4e %+.4j %.4e %+.4j'` for 3 columns
            c) a list of specifiers, one per column - in this case, the real
               and imaginary part must have separate specifiers,
               e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns
    delimiter : str, optional
        String or character separating columns.
    newline : str, optional
        String or character separating lines.

        .. versionadded:: 1.5.0
    header : str, optional
        String that will be written at the beginning of the file.

        .. versionadded:: 1.7.0
    footer : str, optional
        String that will be written at the end of the file.

        .. versionadded:: 1.7.0
    comments : str, optional
        String that will be prepended to the ``header`` and ``footer`` strings,
        to mark them as comments. Default: '# ', as expected by e.g.
        ``numpy.loadtxt``.

        .. versionadded:: 1.7.0


    See Also
    --------
    save : Save an array to a binary file in NumPy ``.npy`` format
    savez : Save several arrays into an uncompressed ``.npz`` archive
    savez_compressed : Save several arrays into a compressed ``.npz`` archive

    Notes
    -----
    Further explanation of the `fmt` parameter
    (``%[flag]width[.precision]specifier``):

    flags:
        ``-`` : left justify

        ``+`` : Forces to precede result with + or -.

        ``0`` : Left pad the number with zeros instead of space (see width).

    width:
        Minimum number of characters to be printed. The value is not truncated
        if it has more characters.

    precision:
        - For integer specifiers (eg. ``d,i,o,x``), the minimum number of
          digits.
        - For ``e, E`` and ``f`` specifiers, the number of digits to print
          after the decimal point.
        - For ``g`` and ``G``, the maximum number of significant digits.
        - For ``s``, the maximum number of characters.

    specifiers:
        ``c`` : character

        ``d`` or ``i`` : signed decimal integer

        ``e`` or ``E`` : scientific notation with ``e`` or ``E``.

        ``f`` : decimal floating point

        ``g,G`` : use the shorter of ``e,E`` or ``f``

        ``o`` : signed octal

        ``s`` : string of characters

        ``u`` : unsigned decimal integer

        ``x,X`` : unsigned hexadecimal integer

    This explanation of ``fmt`` is not complete, for an exhaustive
    specification see [1]_.

    References
    ----------
    .. [1] `Format Specification Mini-Language
           <http://docs.python.org/library/string.html#
           format-specification-mini-language>`_, Python Documentation.

    Examples
    --------
    >>> x = y = z = np.arange(0.0,5.0,1.0)
    >>> np.savetxt('test.out', x, delimiter=',')   # X is an array
    >>> np.savetxt('test.out', (x,y,z))   # x,y,z equal sized 1D arrays
    >>> np.savetxt('test.out', x, fmt='%1.4e')   # use exponential notation

    """

    # Py3 conversions first
    if isinstance(fmt, bytes):
        fmt = asstr(fmt)
    delimiter = asstr(delimiter)

    own_fh = False
    if _is_string_like(fname):
        own_fh = True
        if fname.endswith('.gz'):
            import gzip
            fh = gzip.open(fname, 'wb')
        else:
            if sys.version_info[0] >= 3:
                fh = open(fname, 'wb')
            else:
                fh = open(fname, 'w')
    elif hasattr(fname, 'write'):
        fh = fname
    else:
        raise ValueError('fname must be a string or file handle')

    try:
        X = np.asarray(X)

        # Handle 1-dimensional arrays
        if X.ndim == 1:
            # Common case -- 1d array of numbers
            if X.dtype.names is None:
                X = np.atleast_2d(X).T
                ncol = 1

            # Complex dtype -- each field indicates a separate column
            else:
                ncol = len(X.dtype.descr)
        else:
            ncol = X.shape[1]

        iscomplex_X = np.iscomplexobj(X)
        # `fmt` can be a string with multiple insertion points or a
        # list of formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
        if type(fmt) in (list, tuple):
            if len(fmt) != ncol:
                raise AttributeError('fmt has wrong shape. %s' % str(fmt))
            format = asstr(delimiter).join(map(asstr, fmt))
        elif isinstance(fmt, str):
            n_fmt_chars = fmt.count('%')
            error = ValueError('fmt has wrong number of %% formats: %s' % fmt)
            if n_fmt_chars == 1:
                if iscomplex_X:
                    fmt = [' (%s+%sj)' % (fmt, fmt), ] * ncol
                else:
                    fmt = [fmt, ] * ncol
                format = delimiter.join(fmt)
            elif iscomplex_X and n_fmt_chars != (2 * ncol):
                raise error
            elif ((not iscomplex_X) and n_fmt_chars != ncol):
                raise error
            else:
                format = fmt
        else:
            raise ValueError('invalid fmt: %r' % (fmt,))

        if len(header) > 0:
            header = header.replace('\n', '\n' + comments)
            fh.write(asbytes(comments + header + newline))
        if iscomplex_X:
            for row in X:
                row2 = []
                for number in row:
                    row2.append(number.real)
                    row2.append(number.imag)
                fh.write(asbytes(format % tuple(row2) + newline))
        else:
            for row in X:
                fh.write(asbytes(format % tuple(row) + newline))
        if len(footer) > 0:
            footer = footer.replace('\n', '\n' + comments)
            fh.write(asbytes(comments + footer + newline))
    finally:
        if own_fh:
            fh.close()

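# Hedged sketch of the complex-valued path above (file name 'z.txt' is
# assumed): a single '%' specifier is expanded via ' (%s+%sj)' % (fmt, fmt),
# and each row is flattened into (real, imag) pairs before formatting.
#
#     z = np.array([1 + 2j, 3 + 4j])
#     np.savetxt('z.txt', z, fmt='%.1e')
#     # rows written as ' (1.0e+00+2.0e+00j)' etc.
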
def fromregex(file, regexp, dtype):
    """
    Construct an array from a text file, using regular expression parsing.

    The returned array is always a structured array, and is constructed from
    all matches of the regular expression in the file. Groups in the regular
    expression are converted to fields of the structured array.

    Parameters
    ----------
    file : str or file
        File name or file object to read.
    regexp : str or regexp
        Regular expression used to parse the file.
        Groups in the regular expression correspond to fields in the dtype.
    dtype : dtype or list of dtypes
        Dtype for the structured array.

    Returns
    -------
    output : ndarray
        The output array, containing the part of the content of `file` that
        was matched by `regexp`. `output` is always a structured array.

    Raises
    ------
    TypeError
        When `dtype` is not a valid dtype for a structured array.

    See Also
    --------
    fromstring, loadtxt

    Notes
    -----
    Dtypes for structured arrays can be specified in several forms, but all
    forms specify at least the data type and field name. For details see
    `doc.structured_arrays`.

    Examples
    --------
    >>> f = open('test.dat', 'w')
    >>> f.write("1312 foo\\n1534 bar\\n444 qux")
    >>> f.close()

    >>> regexp = r"(\\d+)\\s+(...)"  # match [digits, whitespace, anything]
    >>> output = np.fromregex('test.dat', regexp,
    ...                       [('num', np.int64), ('key', 'S3')])
    >>> output
    array([(1312L, 'foo'), (1534L, 'bar'), (444L, 'qux')],
          dtype=[('num', '<i8'), ('key', '|S3')])
    >>> output['num']
    array([1312, 1534,  444], dtype=int64)

    """
    own_fh = False
    if not hasattr(file, "read"):
        file = open(file, 'rb')
        own_fh = True

    try:
        if not hasattr(regexp, 'match'):
            regexp = re.compile(asbytes(regexp))
        if not isinstance(dtype, np.dtype):
            dtype = np.dtype(dtype)

        seq = regexp.findall(file.read())
        if seq and not isinstance(seq[0], tuple):
            # Only one group is in the regexp.
            # Create the new array as a single data-type and then
            #   re-interpret as a single-field structured array.
            newdtype = np.dtype(dtype[dtype.names[0]])
            output = np.array(seq, dtype=newdtype)
            output.dtype = dtype
        else:
            output = np.array(seq, dtype=dtype)

        return output
    finally:
        if own_fh:
            file.close()

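# Sketch of the single-group branch above: with one group, findall() returns
# plain strings, so the array is first built with the lone field's base dtype
# and then re-labelled ('nums.dat' is an assumed file name).
#
#     open('nums.dat', 'w').write("10 20 30")
#     np.fromregex('nums.dat', r"(\d+)", [('num', np.int64)])
#     # -> array([(10,), (20,), (30,)], dtype=[('num', '<i8')])
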
#####--------------------------------------------------------------------------
#---- --- ASCII functions ---
#####--------------------------------------------------------------------------

def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
               skiprows=0, skip_header=0, skip_footer=0, converters=None,
               missing='', missing_values=None, filling_values=None,
               usecols=None, names=None,
               excludelist=None, deletechars=None, replace_space='_',
               autostrip=False, case_sensitive=True, defaultfmt="f%i",
               unpack=None, usemask=False, loose=True, invalid_raise=True):
    """
    Load data from a text file, with missing values handled as specified.

    Each line past the first `skip_header` lines is split at the `delimiter`
    character, and characters following the `comments` character are discarded.

    Parameters
    ----------
    fname : file or str
        File, filename, or generator to read. If the filename extension is
        `.gz` or `.bz2`, the file is first decompressed. Note that
        generators must return byte strings in Python 3k.
    dtype : dtype, optional
        Data type of the resulting array.
        If None, the dtypes will be determined by the contents of each
        column, individually.
    comments : str, optional
        The character used to indicate the start of a comment.
        All the characters occurring on a line after a comment are discarded.
    delimiter : str, int, or sequence, optional
        The string used to separate values. By default, any consecutive
        whitespaces act as delimiter. An integer or sequence of integers
        can also be provided as width(s) of each field.
    skip_rows : int, optional
        `skip_rows` was deprecated in numpy 1.5, and will be removed in
        numpy 2.0. Please use `skip_header` instead.
    skip_header : int, optional
        The number of lines to skip at the beginning of the file.
    skip_footer : int, optional
        The number of lines to skip at the end of the file.
    converters : variable, optional
        The set of functions that convert the data of a column to a value.
        The converters can also be used to provide a default value
        for missing data: ``converters = {3: lambda s: float(s or 0)}``.
    missing : variable, optional
        `missing` was deprecated in numpy 1.5, and will be removed in
        numpy 2.0. Please use `missing_values` instead.
    missing_values : variable, optional
        The set of strings corresponding to missing data.
    filling_values : variable, optional
        The set of values to be used as default when the data are missing.
    usecols : sequence, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
    names : {None, True, str, sequence}, optional
        If `names` is True, the field names are read from the first valid line
        after the first `skip_header` lines.
        If `names` is a sequence or a single string of comma-separated names,
        the names will be used to define the field names in a structured dtype.
        If `names` is None, the names of the dtype fields will be used, if any.
    excludelist : sequence, optional
        A list of names to exclude. This list is appended to the default list
        ['return','file','print']. Excluded names have an underscore appended:
        for example, `file` would become `file_`.
    deletechars : str, optional
        A string combining invalid characters that must be deleted from the
        names.
    defaultfmt : str, optional
        A format used to define default field names, such as "f%i" or "f_%02i".
    autostrip : bool, optional
        Whether to automatically strip white spaces from the variables.
    replace_space : char, optional
        Character(s) used in replacement of white spaces in the variables
        names. By default, use a '_'.
    case_sensitive : {True, False, 'upper', 'lower'}, optional
        If True, field names are case sensitive.
        If False or 'upper', field names are converted to upper case.
        If 'lower', field names are converted to lower case.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``
    usemask : bool, optional
        If True, return a masked array.
        If False, return a regular array.
    loose : bool, optional
        If True, do not raise errors for invalid values.
    invalid_raise : bool, optional
        If True, an exception is raised if an inconsistency is detected in the
        number of columns.
        If False, a warning is emitted and the offending lines are skipped.

    Returns
    -------
    out : ndarray
        Data read from the text file. If `usemask` is True, this is a
        masked array.

    See Also
    --------
    numpy.loadtxt : equivalent function when no data is missing.

    Notes
    -----
    * When spaces are used as delimiters, or when no delimiter has been given
      as input, there should not be any missing data between two fields.
    * When the variables are named (either by a flexible dtype or with
      `names`), there must not be any header in the file (else a ValueError
      exception is raised).
    * Individual values are not stripped of spaces by default.
      When using a custom converter, make sure the function does remove spaces.

    References
    ----------
    .. [1] Numpy User Guide, section `I/O with Numpy
           <http://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.

    Examples
    --------
    >>> from StringIO import StringIO
    >>> import numpy as np

    Comma delimited file with mixed dtype

    >>> s = StringIO("1,1.3,abcde")
    >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'),
    ... ('mystring','S5')], delimiter=",")
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])

    Using dtype = None

    >>> s.seek(0) # needed for StringIO example only
    >>> data = np.genfromtxt(s, dtype=None,
    ... names = ['myint','myfloat','mystring'], delimiter=",")
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])

    Specifying dtype and names

    >>> s.seek(0)
    >>> data = np.genfromtxt(s, dtype="i8,f8,S5",
    ... names=['myint','myfloat','mystring'], delimiter=",")
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])

    An example with fixed-width columns

    >>> s = StringIO("11.3abcde")
    >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'],
    ...     delimiter=[1,3,5])
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '|S5')])

    """
    # Py3 data conversions to bytes, for convenience
    if comments is not None:
        comments = asbytes(comments)
    if isinstance(delimiter, unicode):
        delimiter = asbytes(delimiter)
    if isinstance(missing, unicode):
        missing = asbytes(missing)
    if isinstance(missing_values, (unicode, list, tuple)):
        missing_values = asbytes_nested(missing_values)

    #
    if usemask:
        from numpy.ma import MaskedArray, make_mask_descr
    # Check the input dictionary of converters
    user_converters = converters or {}
    if not isinstance(user_converters, dict):
        raise TypeError(
            "The input argument 'converter' should be a valid dictionary "
            "(got '%s' instead)" % type(user_converters))

    # Initialize the filehandle, the LineSplitter and the NameValidator
    own_fhd = False
    try:
        if isinstance(fname, basestring):
            if sys.version_info[0] == 2:
                fhd = iter(np.lib._datasource.open(fname, 'rbU'))
            else:
                fhd = iter(np.lib._datasource.open(fname, 'rb'))
            own_fhd = True
        else:
            fhd = iter(fname)
    except TypeError:
        raise TypeError(
            "fname must be a string, filehandle, or generator. "
            "(got %s instead)" % type(fname))

    split_line = LineSplitter(delimiter=delimiter, comments=comments,
                              autostrip=autostrip)._handyman
    validate_names = NameValidator(excludelist=excludelist,
                                   deletechars=deletechars,
                                   case_sensitive=case_sensitive,
                                   replace_space=replace_space)

    # Get the first valid lines after the first skiprows ones ..
    if skiprows:
        warnings.warn(
            "The use of `skiprows` is deprecated, it will be removed in "
            "numpy 2.0.\nPlease use `skip_header` instead.",
            DeprecationWarning)
        skip_header = skiprows
    # Skip the first `skip_header` rows
    for i in range(skip_header):
        next(fhd)

    # Keep on until we find the first valid values
    first_values = None
    try:
        while not first_values:
            first_line = next(fhd)
            if names is True:
                if comments in first_line:
                    first_line = (
                        asbytes('').join(first_line.split(comments)[1:]))
            first_values = split_line(first_line)
    except StopIteration:
        # return an empty array if the datafile is empty
        first_line = asbytes('')
        first_values = []
        warnings.warn('genfromtxt: Empty input file: "%s"' % fname)

    # Should we take the first values as names ?
    if names is True:
        fval = first_values[0].strip()
        if fval in comments:
            del first_values[0]

    # Check the columns to use: make sure `usecols` is a list
    if usecols is not None:
        try:
            usecols = [_.strip() for _ in usecols.split(",")]
        except AttributeError:
            try:
                usecols = list(usecols)
            except TypeError:
                usecols = [usecols, ]
    nbcols = len(usecols or first_values)

    # Check the names and overwrite the dtype.names if needed
    if names is True:
        names = validate_names([_bytes_to_name(_.strip())
                                for _ in first_values])
        first_line = asbytes('')
    elif _is_string_like(names):
        names = validate_names([_.strip() for _ in names.split(',')])
    elif names:
        names = validate_names(names)
    # Get the dtype
    if dtype is not None:
        dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names)
    # Make sure the names is a list (for 2.5)
    if names is not None:
        names = list(names)

    if usecols:
        for (i, current) in enumerate(usecols):
            # if usecols is a list of names, convert to a list of indices
            if _is_string_like(current):
                usecols[i] = names.index(current)
            elif current < 0:
                usecols[i] = current + len(first_values)
        # If the dtype is not None, make sure we update it
        if (dtype is not None) and (len(dtype) > nbcols):
            descr = dtype.descr
            dtype = np.dtype([descr[_] for _ in usecols])
            names = list(dtype.names)
        # If `names` is not None, update the names
        elif (names is not None) and (len(names) > nbcols):
            names = [names[_] for _ in usecols]
    elif (names is not None) and (dtype is not None):
        names = list(dtype.names)

    # Process the missing values ...............................
    # Rename missing_values for convenience
    user_missing_values = missing_values or ()

    # Define the list of missing_values (one column: one list)
    missing_values = [list([asbytes('')]) for _ in range(nbcols)]

    # We have a dictionary: process it field by field
    if isinstance(user_missing_values, dict):
        # Loop on the items
        for (key, val) in user_missing_values.items():
            # Is the key a string ?
            if _is_string_like(key):
                try:
                    # Transform it into an integer
                    key = names.index(key)
                except ValueError:
                    # We couldn't find it: the name must have been dropped
                    continue
            # Redefine the key as needed if it's a column number
            if usecols:
                try:
                    key = usecols.index(key)
                except ValueError:
                    pass
            # Transform the value as a list of string
            if isinstance(val, (list, tuple)):
                val = [str(_) for _ in val]
            else:
                val = [str(val), ]
            # Add the value(s) to the current list of missing
            if key is None:
                # None acts as default
                for miss in missing_values:
                    miss.extend(val)
            else:
                missing_values[key].extend(val)
    # We have a sequence : each item matches a column
    elif isinstance(user_missing_values, (list, tuple)):
        for (value, entry) in zip(user_missing_values, missing_values):
            value = str(value)
            if value not in entry:
                entry.append(value)
    # We have a string : apply it to all entries
    elif isinstance(user_missing_values, bytes):
        user_value = user_missing_values.split(asbytes(","))
        for entry in missing_values:
            entry.extend(user_value)
    # We have something else: apply it to all entries
    else:
        for entry in missing_values:
            entry.extend([str(user_missing_values)])

    # Process the deprecated `missing`
    if missing != asbytes(''):
        warnings.warn(
            "The use of `missing` is deprecated, it will be removed in "
            "Numpy 2.0.\nPlease use `missing_values` instead.",
            DeprecationWarning)
        values = [str(_) for _ in missing.split(asbytes(","))]
        for entry in missing_values:
            entry.extend(values)

    # Process the filling_values ...............................
    # Rename the input for convenience
    user_filling_values = filling_values
    if user_filling_values is None:
        user_filling_values = []
    # Define the default
    filling_values = [None] * nbcols
    # We have a dictionary : update each entry individually
    if isinstance(user_filling_values, dict):
        for (key, val) in user_filling_values.items():
            if _is_string_like(key):
                try:
                    # Transform it into an integer
                    key = names.index(key)
                except ValueError:
                    # We couldn't find it: the name must have been dropped,
                    continue
            # Redefine the key if it's a column number and usecols is defined
            if usecols:
                try:
                    key = usecols.index(key)
                except ValueError:
                    pass
            # Add the value to the list
            filling_values[key] = val
    # We have a sequence : update on a one-to-one basis
    elif isinstance(user_filling_values, (list, tuple)):
        n = len(user_filling_values)
        if (n <= nbcols):
            filling_values[:n] = user_filling_values
        else:
            filling_values = user_filling_values[:nbcols]
    # We have something else : use it for all entries
    else:
        filling_values = [user_filling_values] * nbcols

    # Initialize the converters ................................
    if dtype is None:
        # Note: we can't use a [...]*nbcols, as we would have 3 times the same
        # ... converter, instead of 3 different converters.
        converters = [StringConverter(None, missing_values=miss, default=fill)
                      for (miss, fill) in zip(missing_values, filling_values)]
    else:
        dtype_flat = flatten_dtype(dtype, flatten_base=True)
        # Initialize the converters
        if len(dtype_flat) > 1:
            # Flexible type : get a converter from each dtype
            zipit = zip(dtype_flat, missing_values, filling_values)
            converters = [StringConverter(dt, locked=True,
                                          missing_values=miss, default=fill)
                          for (dt, miss, fill) in zipit]
        else:
            # Set to a default converter (but w/ different missing values)
            zipit = zip(missing_values, filling_values)
            converters = [StringConverter(dtype, locked=True,
                                          missing_values=miss, default=fill)
                          for (miss, fill) in zipit]
    # Update the converters to use the user-defined ones
    uc_update = []
    for (j, conv) in user_converters.items():
        # If the converter is specified by column names, use the index instead
        if _is_string_like(j):
            try:
                j = names.index(j)
                i = j
            except ValueError:
                continue
        elif usecols:
            try:
                i = usecols.index(j)
            except ValueError:
                # Unused converter specified
                continue
        else:
            i = j
        # Find the value to test - first_line is not filtered by usecols:
        if len(first_line):
            testing_value = first_values[j]
        else:
            testing_value = None
        converters[i].update(conv, locked=True,
                             testing_value=testing_value,
                             default=filling_values[i],
                             missing_values=missing_values[i],)
        uc_update.append((i, conv))
    # Make sure we have the corrected keys in user_converters...
    user_converters.update(uc_update)

    # Fixme: possible error as following variable never used.
    #miss_chars = [_.missing_values for _ in converters]

    # Initialize the output lists ...
    # ... rows
    rows = []
    append_to_rows = rows.append
    # ... masks
    if usemask:
        masks = []
        append_to_masks = masks.append
    # ... invalid
    invalid = []
    append_to_invalid = invalid.append

    # Parse each line
    for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
        values = split_line(line)
        nbvalues = len(values)
        # Skip an empty line
        if nbvalues == 0:
            continue
        # Select only the columns we need
        if usecols:
            try:
                values = [values[_] for _ in usecols]
            except IndexError:
                append_to_invalid((i + skip_header + 1, nbvalues))
                continue
        elif nbvalues != nbcols:
            append_to_invalid((i + skip_header + 1, nbvalues))
            continue
        # Store the values
        append_to_rows(tuple(values))
        if usemask:
            append_to_masks(tuple([v.strip() in m
                                   for (v, m) in zip(values,
                                                     missing_values)]))

    if own_fhd:
        fhd.close()

    # Upgrade the converters (if needed)
    if dtype is None:
        for (i, converter) in enumerate(converters):
            current_column = [itemgetter(i)(_m) for _m in rows]
            try:
                converter.iterupgrade(current_column)
            except ConverterLockError:
                errmsg = "Converter #%i is locked and cannot be upgraded: " % i
                current_column = map(itemgetter(i), rows)
                for (j, value) in enumerate(current_column):
                    try:
                        converter.upgrade(value)
                    except (ConverterError, ValueError):
                        errmsg += "(occurred line #%i for value '%s')"
                        errmsg %= (j + 1 + skip_header, value)
                        raise ConverterError(errmsg)

    # Check that we don't have invalid values
    nbinvalid = len(invalid)
    if nbinvalid > 0:
        nbrows = len(rows) + nbinvalid - skip_footer
        # Construct the error message
        template = "    Line #%%i (got %%i columns instead of %i)" % nbcols
        if skip_footer > 0:
            nbinvalid_skipped = len([_ for _ in invalid
                                     if _[0] > nbrows + skip_header])
            invalid = invalid[:nbinvalid - nbinvalid_skipped]
            skip_footer -= nbinvalid_skipped
#
#            nbrows -= skip_footer
#            errmsg = [template % (i, nb)
#                      for (i, nb) in invalid if i < nbrows]
#        else:
        errmsg = [template % (i, nb)
                  for (i, nb) in invalid]
        if len(errmsg):
            errmsg.insert(0, "Some errors were detected !")
            errmsg = "\n".join(errmsg)
            # Raise an exception ?
            if invalid_raise:
                raise ValueError(errmsg)
            # Issue a warning ?
            else:
                warnings.warn(errmsg, ConversionWarning)

    # Strip the last skip_footer data
    if skip_footer > 0:
        rows = rows[:-skip_footer]
        if usemask:
            masks = masks[:-skip_footer]

    # Convert each value according to the converter:
    # We want to modify the list in place to avoid creating a new one...
    if loose:
        rows = list(
            zip(*[[conv._loose_call(_r) for _r in map(itemgetter(i), rows)]
                  for (i, conv) in enumerate(converters)]))
    else:
        rows = list(
            zip(*[[conv._strict_call(_r) for _r in map(itemgetter(i), rows)]
                  for (i, conv) in enumerate(converters)]))

    # Reset the dtype
    data = rows
    if dtype is None:
        # Get the dtypes from the types of the converters
        column_types = [conv.type for conv in converters]
        # Find the columns with strings...
        strcolidx = [i for (i, v) in enumerate(column_types)
                     if v in (type('S'), np.string_)]
        # ... and take the largest number of chars.
        for i in strcolidx:
            column_types[i] = "|S%i" % max(len(row[i]) for row in data)
        #
        if names is None:
            # If the dtype is uniform, don't define names, else use ''
            base = set([c.type for c in converters if c._checked])
            if len(base) == 1:
                (ddtype, mdtype) = (list(base)[0], np.bool)
            else:
                ddtype = [(defaultfmt % i, dt)
                          for (i, dt) in enumerate(column_types)]
                if usemask:
                    mdtype = [(defaultfmt % i, np.bool)
                              for (i, dt) in enumerate(column_types)]
        else:
            ddtype = list(zip(names, column_types))
            mdtype = list(zip(names, [np.bool] * len(column_types)))
        output = np.array(data, dtype=ddtype)
        if usemask:
            outputmask = np.array(masks, dtype=mdtype)
    else:
        # Overwrite the initial dtype names if needed
        if names and dtype.names:
            dtype.names = names
        # Case 1. We have a structured type
        if len(dtype_flat) > 1:
            # Nested dtype, eg [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
            # First, create the array using a flattened dtype:
            # [('a', int), ('b1', int), ('b2', float)]
            # Then, view the array using the specified dtype.
            if 'O' in (_.char for _ in dtype_flat):
                if has_nested_fields(dtype):
                    raise NotImplementedError(
                        "Nested fields involving objects are not supported...")
                else:
                    output = np.array(data, dtype=dtype)
            else:
                rows = np.array(data, dtype=[('', _) for _ in dtype_flat])
                output = rows.view(dtype)
            # Now, process the rowmasks the same way
            if usemask:
                rowmasks = np.array(
                    masks, dtype=np.dtype([('', np.bool) for t in dtype_flat]))
                # Construct the new dtype
                mdtype = make_mask_descr(dtype)
                outputmask = rowmasks.view(mdtype)
        # Case #2. We have a basic dtype
        else:
            # We used some user-defined converters
            if user_converters:
                ishomogeneous = True
                descr = []
                for i, ttype in enumerate([conv.type for conv in converters]):
                    # Keep the dtype of the current converter
                    if i in user_converters:
                        ishomogeneous &= (ttype == dtype.type)
                        if ttype == np.string_:
                            ttype = "|S%i" % max(len(row[i]) for row in data)
                        descr.append(('', ttype))
                    else:
                        descr.append(('', dtype))
                # So we changed the dtype ?
                if not ishomogeneous:
                    # We have more than one field
                    if len(descr) > 1:
                        dtype = np.dtype(descr)
                    # We have only one field: drop the name if not needed.
                    else:
                        dtype = np.dtype(ttype)
            #
            output = np.array(data, dtype)
            if usemask:
                if dtype.names:
                    mdtype = [(_, np.bool) for _ in dtype.names]
                else:
                    mdtype = np.bool
                outputmask = np.array(masks, dtype=mdtype)
    # Try to take care of the missing data we missed
    names = output.dtype.names
    if usemask and names:
        for (name, conv) in zip(names or (), converters):
            missing_values = [conv(_) for _ in conv.missing_values
                              if _ != asbytes('')]
            for mval in missing_values:
                outputmask[name] |= (output[name] == mval)
    # Construct the final array
    if usemask:
        output = output.view(MaskedArray)
        output._mask = outputmask
    if unpack:
        return output.squeeze().T
    return output.squeeze()

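# Hedged sketch of the missing/filling machinery above (Python 2 semantics,
# matching this tree; values are illustrative):
#
#     from StringIO import StringIO
#     s = StringIO("1,N/A\n2,3")
#     np.genfromtxt(s, delimiter=',', missing_values='N/A', filling_values=-1)
#     # -> array([[ 1., -1.],
#     #           [ 2.,  3.]])
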
def ndfromtxt(fname, **kwargs):
    """
    Load ASCII data stored in a file and return it as a single array.

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function.

    """
    kwargs['usemask'] = False
    return genfromtxt(fname, **kwargs)

def mafromtxt(fname, **kwargs):
    """
    Load ASCII data stored in a text file and return a masked array.

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function to load ASCII data.

    """
    kwargs['usemask'] = True
    return genfromtxt(fname, **kwargs)

def recfromtxt(fname, **kwargs):
    """
    Load ASCII data from a file and return it in a record array.

    If ``usemask=False`` a standard `recarray` is returned,
    if ``usemask=True`` a MaskedRecords array is returned.

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function

    Notes
    -----
    By default, `dtype` is None, which means that the data-type of the output
    array will be determined from the data.

    """
    kwargs.setdefault("dtype", None)
    usemask = kwargs.get('usemask', False)
    output = genfromtxt(fname, **kwargs)
    if usemask:
        from numpy.ma.mrecords import MaskedRecords
        output = output.view(MaskedRecords)
    else:
        output = output.view(np.recarray)
    return output

def recfromcsv(fname, **kwargs):
    """
    Load ASCII data stored in a comma-separated file.

    The returned array is a record array (if ``usemask=False``, see
    `recarray`) or a masked record array (if ``usemask=True``,
    see `ma.mrecords.MaskedRecords`).

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function to load ASCII data.

    Notes
    -----
    By default, `dtype` is None, which means that the data-type of the output
    array will be determined from the data.

    """
    # Set default kwargs for genfromtxt as relevant to csv import.
    kwargs.setdefault("case_sensitive", "lower")
    kwargs.setdefault("names", True)
    kwargs.setdefault("delimiter", ",")
    kwargs.setdefault("dtype", None)
    output = genfromtxt(fname, **kwargs)

    usemask = kwargs.get("usemask", False)
    if usemask:
        from numpy.ma.mrecords import MaskedRecords
        output = output.view(MaskedRecords)
    else:
        output = output.view(np.recarray)
    return output
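
# Assumed usage sketch ('data.csv' and the 'price' column are hypothetical):
# the defaults set above mean the CSV header row is read as field names and
# lower-cased, and the result supports recarray attribute access.
#
#     r = np.recfromcsv('data.csv')   # header row -> r.dtype.names
#     r.price                         # same as r['price']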