Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/numpy/lib/_datasource.py : 22%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""A file interface for handling local and remote data files.

The goal of datasource is to abstract some of the file system operations
when dealing with data files so the researcher doesn't have to know all the
low-level details. Through datasource, a researcher can obtain and use a
file with one function call, regardless of location of the file.

DataSource is meant to augment standard python libraries, not replace them.
It should work seamlessly with standard file IO operations and the os
module.

DataSource files can originate locally or remotely:

- local files : '/home/guido/src/local/data.txt'
- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

DataSource files can also be compressed or uncompressed. Currently only
gzip, bz2 and xz are supported.

Example::

    >>> # Create a DataSource, use os.curdir (default) for local storage.
    >>> from numpy import DataSource
    >>> ds = DataSource()
    >>>
    >>> # Open a remote file.
    >>> # DataSource downloads the file, stores it locally in:
    >>> #     './www.google.com/index.html'
    >>> # opens the file and returns a file object.
    >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
    >>>
    >>> # Use the file as you normally would
    >>> fp.read() # doctest: +SKIP
    >>> fp.close() # doctest: +SKIP

"""
37import os
38import shutil
39import io
40from contextlib import closing
42from numpy.core.overrides import set_module
# Keep a reference to the builtin ``open`` under a private name: this module
# defines its own module-level ``open`` function further down, which shadows
# the builtin for every definition in the file.
_open = open
48def _check_mode(mode, encoding, newline):
49 """Check mode and that encoding and newline are compatible.
51 Parameters
52 ----------
53 mode : str
54 File open mode.
55 encoding : str
56 File encoding.
57 newline : str
58 Newline for text files.
60 """
61 if "t" in mode:
62 if "b" in mode:
63 raise ValueError("Invalid mode: %r" % (mode,))
64 else:
65 if encoding is not None:
66 raise ValueError("Argument 'encoding' not supported in binary mode")
67 if newline is not None:
68 raise ValueError("Argument 'newline' not supported in binary mode")
71# Using a class instead of a module-level dictionary
72# to reduce the initial 'import numpy' overhead by
73# deferring the import of lzma, bz2 and gzip until needed
75# TODO: .zip support, .tar support?
76class _FileOpeners:
77 """
78 Container for different methods to open (un-)compressed files.
80 `_FileOpeners` contains a dictionary that holds one method for each
81 supported file format. Attribute lookup is implemented in such a way
82 that an instance of `_FileOpeners` itself can be indexed with the keys
83 of that dictionary. Currently uncompressed files as well as files
84 compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
86 Notes
87 -----
88 `_file_openers`, an instance of `_FileOpeners`, is made available for
89 use in the `_datasource` module.
91 Examples
92 --------
93 >>> import gzip
94 >>> np.lib._datasource._file_openers.keys()
95 [None, '.bz2', '.gz', '.xz', '.lzma']
96 >>> np.lib._datasource._file_openers['.gz'] is gzip.open
97 True
99 """
101 def __init__(self):
102 self._loaded = False
103 self._file_openers = {None: io.open}
105 def _load(self):
106 if self._loaded:
107 return
109 try:
110 import bz2
111 self._file_openers[".bz2"] = bz2.open
112 except ImportError:
113 pass
115 try:
116 import gzip
117 self._file_openers[".gz"] = gzip.open
118 except ImportError:
119 pass
121 try:
122 import lzma
123 self._file_openers[".xz"] = lzma.open
124 self._file_openers[".lzma"] = lzma.open
125 except (ImportError, AttributeError):
126 # There are incompatible backports of lzma that do not have the
127 # lzma.open attribute, so catch that as well as ImportError.
128 pass
130 self._loaded = True
132 def keys(self):
133 """
134 Return the keys of currently supported file openers.
136 Parameters
137 ----------
138 None
140 Returns
141 -------
142 keys : list
143 The keys are None for uncompressed files and the file extension
144 strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
145 methods.
147 """
148 self._load()
149 return list(self._file_openers.keys())
151 def __getitem__(self, key):
152 self._load()
153 return self._file_openers[key]
155_file_openers = _FileOpeners()
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    When ``path`` is a URL it is downloaded first, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``.

    """
    # One throw-away DataSource per call; it does all the actual work.
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline)
@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs. The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories. The getattr default covers instances
        # whose __init__ never got as far as setting the flag.
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # NOTE(review): 'a' and 'x' are not treated as write modes here, so
        # open() does not reject them for URLs - confirm this is intended.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                ``zip_ext`` is None when `filename` has no recognised
                compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # The None key stands for "uncompressed"; skip it.
                if zipext:
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test for
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        Raises
        ------
        urllib.error.URLError
            If `path` is a URL that cannot be opened.

        """
        # We import these here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            try:
                with closing(urlopen(path)) as openedurl:
                    with _open(upath, 'wb') as f:
                        shutil.copyfileobj(openedurl, f)
            except URLError as err:
                # Chain explicitly so the underlying reason stays visible.
                raise URLError("URL not found: %s" % path) from err
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is a URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is a URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        # Split once only: everything after the first occurrence is kept,
        # even if the destination directory name shows up again later on.
        splitpath = path.split(self._destpath, 1)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)

        Leading separators, leading parent-directory components and drive
        prefixes are removed; other leading dots (e.g. ``'.hidden'``) are
        part of the name and are kept.
        """
        # Note: os.path.join treats '/' as os.sep on Windows
        parent_prefixes = (os.pardir + os.sep, os.pardir + '/')
        last = None
        path = os.path.normpath(path)
        while path != last:
            last = path
            path = path.lstrip(os.sep).lstrip('/')
            # Remove one whole leading '..' component per pass.  str.lstrip
            # would strip *characters* and also eat the dot of '.hidden'.
            if path == os.pardir:
                path = ''
            elif path.startswith(parent_prefixes):
                path = path[len(os.pardir):]
            drive, path = os.path.splitdrive(path)  # for Windows
        # normpath() turns an empty path into '.', which must map back to ''.
        if path == os.curdir:
            path = ''
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is a URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Being able to open the URL is all we need to know; the
                # handle is closed again straight away.
                with closing(urlopen(path)):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is a URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is a URL and `mode` is a write mode.
        FileNotFoundError
            If `path` cannot be found locally, in the cache or remotely.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if found:
            _fname, ext = self._splitzipext(found)
            # The extension keeps its leading dot, and str.replace returns
            # a new string, so the result has to be assigned back.
            if ext == '.bz2':
                # bz2 has no update ('+') mode; drop the flag.
                mode = mode.replace("+", "")
            return _file_openers[ext](found, mode=mode,
                                      encoding=encoding, newline=newline)
        else:
            # FileNotFoundError is an OSError, so handlers written for the
            # previous IOError keep working.
            raise FileNotFoundError("%s not found." % path)
class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one base URL or directory.

    `Repository` prepends a base URL (or directory) to every path it is
    given. Use it when working with several files from the same location:
    create the `Repository` with the base URL once, then refer to each
    file by its filename only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class's job.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl occurs in path already, hand it back untouched
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return DataSource._findfile(self, full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        For a URL, the result is the location where the file is stored
        locally, or where it would be stored once opened with the `open`
        method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return DataSource.abspath(self, full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For a URL, `exists` returns True both when the file is stored
        locally in the `DataSource` directory and when it is a valid remote
        URL; the two cases are not distinguished.

        """
        full = self._fullpath(path)
        return DataSource.exists(self, full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        A URL is downloaded first, stored in the DataSource directory and
        opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return DataSource.open(self, full, mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository base is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")