Coverage for /var/devmt/py/utils4_1.6.0/utils4/utils.py: 100%
120 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-13 13:28 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-13 13:28 +0000
1# -*- coding: utf-8 -*-
2"""
3:Purpose: Central library for general utility-based methods.
5 This ``utils`` module was the starting place of the original
6 ``utils`` library. Therefore, it's historically been a
7 'dumping-ground' for general S3DEV utilities and function
8 wrappers specialised to the needs of S3DEV projects, which
9 did not seem to fit in anywhere else. So we'll be honest,
10 it's a bit of a melting pot of functions.
12 With the overhaul of the ``utils3`` library into ``utils4``,
13 *many* of the original functions, which were no longer being
14 used, have been removed in an effort to clean the module's
15 code base.
17 If you are looking for a function which used to be here,
18 please refer to the last ``utils3`` release, which is
19 v0.15.1.
21:Platform: Linux/Windows | Python 3.8+
22:Developer: J Berendt
23:Email: support@s3dev.uk
25Note:
26 Any libraries which are not built-in, are imported *only*
27 if/when the function which uses them is called.
29 This helps to reduce the packages required by ``utils4``.
31:Example:
33 For usage examples, please refer to the docstring for each method.
35"""
36# pylint: disable=import-error
37# pylint: disable=import-outside-toplevel # Keep required dependencies to a minimum.
38# pylint: disable=wrong-import-order
40import gzip
41import importlib
42import io
43import os
44import pandas as pd
45import platform
46import re
47import site
48import string
49import subprocess
50from datetime import datetime
51from typing import Generator, Union
52from utils4.reporterror import reporterror
53from utils4.user_interface import ui
55# OS-dependent imports
56try: # pragma: nocover
57 import win32api
58 import win32file
59except ImportError:
60 pass
def clean_dataframe(df: pd.DataFrame):
    """Tidy the column names and text values of a ``pandas.DataFrame``.

    Args:
        df (pd.DataFrame): DataFrame to be cleaned.

    :Design:
        The DataFrame is modified *in-place*; nothing is returned.

        - Column names:

            - Leading and trailing whitespace is stripped and the name is
              converted to lower case.
            - The ``-``, ``[space]`` and ``_`` characters are each replaced
              with an underscore.
            - All other punctuation characters are removed.

        - Data:

            - Each column with an ``object`` dtype is cast to ``str`` and
              stripped of leading and trailing whitespace.

    :Example:

        Example for cleaning a DataFrame::

            >>> import pandas as pd  # For demonstration only.
            >>> from utils4 import utils

            >>> df = pd.DataFrame({'Column #1': [' Text field 1.',
                                                 ' Text field 2. '],
                                   ' COLUmn (2)': [1.0, '4'],
                                   'COLUMN 3 ': [1, 2.0]})
            >>> utils.clean_dataframe(df)
            >>> df
                    column_1 column_2  column_3
            0  Text field 1.      1.0       1.0
            1  Text field 2.        4       2.0

    """
    # Build a single translation table: the three separator characters map
    # to an underscore, all remaining punctuation is deleted.
    separators = ('-', '_', ' ')
    table = str.maketrans({char: ('_' if char in separators else '')
                           for char in string.punctuation + ' '})
    df.columns = [name.strip().lower().translate(table) for name in df.columns]
    # Only object (text) columns are touched; other dtypes are left alone.
    for col in df:
        if df[col].dtype != object:
            continue
        df[col] = df[col].astype(str).str.strip()
def direxists(path: str, create_path: bool=False) -> bool:
    """Test if a directory exists. If not, create it, if instructed.

    Args:
        path (str): The directory path to be tested.
        create_path (bool, optional): Create the path if it doesn't exist.
            Defaults to False.

    :Design:
        If the last component of ``path`` has a file extension it is
        assumed to be a filename and is stripped before the test begins.

        The remaining path is tested with :func:`os.path.isdir`, so an
        existing *file* at that location is reported as ``False``.

        If the path does not exist and ``create_path`` is ``True``, the
        directory (and any missing parents) is created, then tested again.

    :Example:

        Test if a directory exists, and create it if it does not exist::

            >>> from utils4 import utils

            >>> utils.direxists(path='/tmp/path/to_create/file.csv',
                                create_path=True)
            True

    Returns:
        bool: True if the directory exists (or was created), otherwise False.

    """
    if os.path.splitext(path)[1]:
        path = os.path.dirname(path)  # Remove file if passed with the path.
    if os.path.exists(path):
        # Something is already there; only a directory counts as found.
        return os.path.isdir(path)
    if not create_path:
        return False
    # exist_ok: tolerate the directory being created by someone else between
    # the test above and this call.
    os.makedirs(name=path, exist_ok=True)
    return os.path.isdir(path)
def fileexists(filepath: str, error: str='ignore') -> bool:
    """Test if a file exists. If not, notify the user or raise an error.

    Args:
        filepath (str): Full file path to test.
        error (str, optional): Action to be taken if the file does not exist.
            Defaults to 'ignore'. Options:

            - ``'ignore'``: Take no action.
            - ``'alert'``: Alert the user the filepath does not exist via
              a simple message to the terminal.
            - ``'raise'``: Raise a ``FileNotFoundError``. This will abort
              all subsequent processing.

            Any other value is treated the same as ``'ignore'``.

    :Design:
        This function extends the built-in :func:`os.path.isfile` function
        in that the user can be notified if the path does not exist, or an
        error can be raised.

    :Example:

        Test if a file exists, using ``'ignore'``, the default action::

            >>> from utils4 import utils

            >>> utils.fileexists(filepath='/tmp/path/to/file.csv')
            False

        Test if a file exists, using ``'alert'``::

            >>> utils.fileexists(filepath='/tmp/path/to/file.csv',
                                 error='alert')

            File not found: /tmp/path/to/file.csv
            False

        Test if a file exists, using ``'raise'``::

            >>> utils.fileexists(filepath='/tmp/path/to/file.csv',
                                 error='raise')
            FileNotFoundError: File not found: /tmp/path/to/file.csv

    Raises:
        FileNotFoundError: If the filepath does not exist and the ``error``
        parameter is ``'raise'``.

    Returns:
        bool: True if the file exists, otherwise False.

    """
    if os.path.isfile(filepath):
        return True
    # The file is missing; act on the caller's chosen error mode.
    if error == 'raise':
        raise FileNotFoundError(f'File not found: {filepath}')
    if error == 'alert':
        ui.print_warning(f'\nFile not found: {filepath}')
    return False
def format_exif_date(datestring: str,
                     input_format: str='%Y:%m:%d %H:%M:%S',
                     output_format: str='%Y%m%d%H%M%S',
                     return_datetime: bool=False) -> Union[datetime, str]:
    """Re-format an exif timestamp.

    Useful when an exif date must be stored as a datetime string, for
    example when loading image metadata into a database.

    Args:
        datestring (str): The datetime string to be formatted.
            A typical exif date format is: yyyy:mm:dd hh:mi:ss
        input_format (str, optional): Format mask for the input datetime value.
            Defaults to '%Y:%m:%d %H:%M:%S'.
        output_format (str, optional): Format mask for the output datetime,
            if returned as a string. Defaults to '%Y%m%d%H%M%S'.
        return_datetime (bool, optional): Return a ``datetime`` object, rather
            than a formatted string. Defaults to False.

    :Design:
        The input is parsed with :meth:`datetime.strptime` using
        ``input_format``, then either returned as-is or rendered with
        ``output_format``.

    :Example:

        Convert the exif datetime to the default output string format::

            >>> from utils4 import utils

            >>> utils.format_exif_date('2010:01:31 12:31:18')
            '20100131123118'

        Convert the exif datetime to a datetime object::

            >>> utils.format_exif_date('2010:01:31 12:31:18',
                                       return_datetime=True)
            datetime.datetime(2010, 1, 31, 12, 31, 18)

    Returns:
        Union[str, datetime.datetime]: A formatted datetime string, if the
        ``return_datetime`` parameter is ``False``, otherwise a
        ``datetime.datetime`` object.

    """
    parsed = datetime.strptime(datestring, input_format)
    return parsed if return_datetime else parsed.strftime(output_format)
def get_os() -> str:
    """Return the name of the platform's operating system, in lower case.

    This is a very thin wrapper around :func:`platform.system()`.

    :Example:
        ::

            >>> from utils4 import utils

            >>> utils.get_os()
            'linux'

    Returns:
        str: The value of :func:`platform.system()`, lower-cased.

    """
    system_name = platform.system()
    return system_name.lower()
def get_removable_drives() -> Generator[str, None, None]:
    """Return a generator of removable drives.

    Note:
        A removable drive is identified by the constant 2, which is the
        value of the enum ``win32con.DRIVE_REMOVABLE``.

    Note:
        As this is a generator function, nothing is executed until the
        generator is first iterated. This includes the OS check, so the
        ``NotImplementedError`` surfaces on the first ``next()`` call, not
        when the function is called.

    Raises:
        NotImplementedError: Raised if the OS is not Windows.

    Yields:
        Generator[str]: Each removable drive letter as a
        string. For example: ``E:``

    """
    if get_os() != 'windows':
        raise NotImplementedError('This function is Windows-only.')
    drive_removable = 2  # Value of win32con.DRIVE_REMOVABLE.
    drives = win32api.GetLogicalDriveStrings().split('\\\x00')  # pragma: nocover
    yield from (d for d in drives  # pragma: nocover
                if win32file.GetDriveType(d) == drive_removable)
def getdrivername(driver: str, return_all: bool=False) -> Union[list, str]:  # pragma: nocover
    """Return the ODBC driver name(s) matching the regex pattern.

    Args:
        driver (str): A **regex pattern** for the ODBC driver you're searching.
        return_all (bool, optional): If True, *all* drivers matching the
            pattern are returned as a list. Defaults to False, which
            returns only the first matching driver name.

    :Design:
        This is a helper function designed to get and return the names
        of ODBC drivers.

        Each name reported by ``pyodbc.drivers()`` is tested against the
        ``driver`` pattern using :func:`re.search`.

        This function has a dependency on the ``pyodbc`` library. Therefore,
        the :func:`~utils.testimport()` function is called before ``pyodbc``
        is imported. If the ``pyodbc`` library is not installed, the user is
        notified and an empty list is returned.

    :Dependencies:
        - ``pyodbc`` library

    :Example:

        Get the driver name for the SQL Server ODBC driver::

            >>> from utils4 import utils
            >>> driver = utils.getdrivername(driver='SQL Server.*')

    :Troubleshooting:

        - On Unix-like systems, the following error message::

            ImportError: libodbc.so.2: cannot open shared object file: No such file or directory

          can be resolved by installing the ``unixodbc-dev`` package as::

            $ sudo apt install unixodbc-dev

    Returns:
        Union[list, str]: The return type depends on the outcome:

        - ``return_all=True``: a list of all matching driver names.
        - ``return_all=False`` and a match was found: the first matching
          driver name, as a string.
        - No match, or ``pyodbc`` is not installed: an empty list.

    """
    if not testimport('pyodbc', verbose=True):
        return []
    import pyodbc
    matches = [name for name in pyodbc.drivers() if re.search(driver, name)]
    if matches and not return_all:
        return matches[0]  # Note: a str, not a list.
    return matches
def getsitepackages() -> str:
    """Return the Python installation's site packages directory.

    :Design:
        The function first uses the local :func:`~utils.get_os()`
        function to get the system's OS. The OS is then tested and the
        site-packages location is returned using the OS-appropriate element
        from the list returned by the built-in :func:`site.getsitepackages`
        function: element 1 for Windows, element 0 for Linux.

        If the OS is not accounted for, a value of 'unknown' is returned.
        This includes macOS: its OS name (``'darwin'``) contains the
        substring ``'win'`` and is therefore explicitly excluded from the
        Windows test.

    :Rationale:
        There are many different (and contradictory) suggestions for how to
        find the location to which ``pip`` will install a package. Also, the
        :func:`site.getsitepackages` function returns a list of options,
        and the Linux / Windows paths are in different locations in this
        list.

    :Example:

        Get the location of the ``site-packages`` directory::

            >>> from utils4 import utils

            >>> utils.getsitepackages()
            '/home/<username>/venvs/py38/lib/python3.8/site-packages'

    Returns:
        str: Full path to the ``site-packages`` directory, or 'unknown'.

    """
    _os = get_os()
    # 'win' is also a substring of 'darwin', which must not be treated
    # as Windows.
    if 'win' in _os and _os != 'darwin':  # pragma: nocover  # Rarely tested on Windows.
        return site.getsitepackages()[1]
    if 'lin' in _os:
        return site.getsitepackages()[0]
    return 'unknown'
def gzip_compress(in_path: str, out_path: str=None, size: int=None) -> str:
    """Compress a file using ``gzip``.

    Args:
        in_path (str): Full path to the file to be compressed. If the file
            does not exist, a ``FileNotFoundError`` is raised.
        out_path (str, optional): Full path to the compressed output file.
            Defaults to None. If this value is ``None`` a ``'.gz'`` file
            extension is appended to the path provided to the ``in_path``
            parameter.
        size (int, optional): Size of the chunk to be read / written during
            compression. Defaults to 10MiB.

    :Design:
        The input file is streamed, ``size`` bytes at a time, through a
        single gzip stream opened with :func:`gzip.open`, at compression
        level 9. Consequently, an empty input file still produces a valid
        (header-only) gzip file.

    :Example:

        Compress a text file::

            >>> from utils4 import utils

            >>> utils.gzip_compress(in_path='/tmp/rand.txt')
            '/tmp/rand.txt.gz'

        Compress a text file, specifying the output path::

            >>> utils.gzip_compress(in_path='/tmp/rand.txt', out_path='/tmp/rand2.txt.gz')
            '/tmp/rand2.txt.gz'

    Raises:
        FileNotFoundError: If ``in_path`` does not exist.

    Returns:
        str: Full path to the output file.

    """
    size = 1024*1024*10 if size is None else size  # Default to 10MiB.
    fileexists(filepath=in_path, error='raise')  # Raises if the file is not found.
    if out_path is None:
        out_path = f'{in_path}.gz'
    with open(in_path, 'rb') as f_in:
        with gzip.open(out_path, 'wb', compresslevel=9) as f_out:
            # Stream the file so memory use is bounded by the chunk size.
            while chunk := f_in.read(size):
                f_out.write(chunk)
    return out_path
def gzip_decompress(path: str, encoding: str='utf-8', size: int=None) -> bool:
    """Decompress a ``.gz`` file using ``gzip``.

    Args:
        path (str): Full path to the file to be decompressed.
        encoding (str, optional): Encoding to be used to decode the
            decompressed binary data. Defaults to 'utf-8'.
        size (int, optional): Size of the chunk (in characters) to be
            read / written during decompression. Defaults to 1MiB.

    Note:
        The output path is simply the ``path`` value with *last* file
        extension removed.

        In general cases, a file compressed using gzip will have a ``.gz``
        extension appended onto the existing filename and extension.
        For example: ``data.txt.gz``.

        If ``path`` has no file extension the output path would be the
        input file itself; this is treated as an error rather than
        overwriting the input.

    Note:
        **Newline Characters:**

        Both the compressed input and the decompressed output are opened
        with ``newline=''``, so no newline translation takes place and the
        *original* line endings are written back to the decompressed file.

        This method is used to ensure the checksum hash on the original
        (unzipped) and decompressed file can be compared.

    Note:
        **Errors:**

        All exceptions (including the ``FileNotFoundError`` for a missing
        input file) are passed to ``reporterror`` and ``False`` is returned;
        they are not propagated to the caller.

    :Example:

        Decompress a text file::

            >>> from utils4 import utils

            >>> utils.gzip_decompress(path='/tmp/rand.txt.gz')
            True

    Returns:
        bool: True if the decompression was successful, otherwise False.

    """
    size = 1024*1024 if size is None else size  # Default to 1 MiB.
    success = False
    try:
        if fileexists(filepath=path, error='raise'):
            out_path = os.path.splitext(path)[0]
            if out_path == path:
                # Opening the output would truncate the input file.
                raise ValueError(f'Cannot derive an output path, no file extension: {path}')
            # The gzip stream is read through gzip.open (rather than slicing
            # the raw compressed bytes) so chunk boundaries never split a
            # compressed member or a multi-byte character.
            with gzip.open(path, 'rt', encoding=encoding, newline='') as f_in:
                # NOTE(review): the output is always written as UTF-8,
                # regardless of ``encoding``; confirm this is intended.
                with open(out_path, 'w', encoding='utf-8', newline='') as f_out:
                    while chunk := f_in.read(size):
                        f_out.write(chunk)
            success = True
    except Exception as err:  # Deliberate best-effort: report and return False.
        reporterror(err)
    return success
def ping(server: str, count: int=1, timeout: int=5, verbose: bool=False) -> bool:
    r"""Ping an IP address, server or web address.

    Args:
        server (str): IP address, server name or web address.
        count (int, optional): The number of ping attempts. Defaults to 1.
        timeout (int, optional): Number of seconds to wait for response.
            Defaults to 5.
        verbose (bool, optional): Display all stdout and/or stderr output, if
            the returned status code is non-zero. Defaults to False.

    :Design:
        Using the platform's native ``ping`` command (via a ``subprocess``
        call) the host is pinged, and a boolean value is returned to the
        caller to indicate if the ping was successful.

        A ping status:

            - 0 returns True
            - Non-zero returns False

        If the server name is preceded by ``\\`` or ``//``, these are
        stripped out using the built-in :func:`os.path.basename()` function.

        Only Windows and Linux are supported. For any other OS (including
        macOS, reported as ``'darwin'``) an alert is displayed, no command
        is run and ``False`` is returned.

    :Example:

        Ping the local PC at 127.0.0.1::

            >>> from utils4 import utils

            >>> utils.ping(server='127.0.0.1')
            True

        Ping an unknown server::

            >>> utils.ping(server='//S3DHOST01', verbose=True)

            [PingError]:
            ping: S3DHOST01: Temporary failure in name resolution
            False

        Ping an unreachable IP address::

            >>> utils.ping(server='192.168.0.99', count=3, verbose=True)

            [PingError]:
            PING 192.168.0.99 (192.168.0.99) 56(84) bytes of data.
            From 192.168.0.XX icmp_seq=1 Destination Host Unreachable
            From 192.168.0.XX icmp_seq=2 Destination Host Unreachable
            From 192.168.0.XX icmp_seq=3 Destination Host Unreachable

            --- 192.168.0.99 ping statistics ---
            3 packets transmitted, 0 received, +3 errors, 100% packet loss, time 2037ms
            False

    Returns:
        bool: True if the ping was successful, otherwise False.

    """
    cmd = []
    server = os.path.basename(server)
    status = 1
    stdout = None
    stderr = None
    _os = get_os()
    # 'win' is also a substring of 'darwin', which must not be treated
    # as Windows.
    is_windows = 'win' in _os and _os != 'darwin'
    if is_windows:  # pragma: nocover  # utils4 will *rarely* ever be tested on Windows.
        timeout *= 1000  # Windows timeout (-w) is in milliseconds.
        cmd = ['ping', '-n', str(count), '-w', str(timeout), server]
    elif 'lin' in _os:
        cmd = ['ping', f'-c{count}', f'-W{timeout}', server]
    else:  # pragma: nocover
        ui.print_alert('\nProcess aborted, unsupported OS.\n'
                       f'- OS identified as: {_os}\n')
    if cmd:
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
            stdout, stderr = proc.communicate()
            status = proc.returncode
        if is_windows and b'Destination host unreachable' in stdout:  # pragma: nocover
            # Hard code status if host is unreachable.
            # Generally, this will return 0, so it must be overridden.
            status = 1
    if verbose and cmd and status != 0:
        ui.print_alert('\n[PingError]:')
        if stdout:
            ui.print_alert(text=stdout.decode().strip())
        if stderr:
            ui.print_alert(text=stderr.decode().strip())
    return status == 0
def strip_ansi_colour(text: str) -> Generator[str, None, None]:
    r"""Strip ANSI colour sequences from a string.

    Args:
        text (str): Text string to be stripped.

    Note:
        This method is *very* basic and only caters to colour sequences.

        It is designed to yield all characters that are not part of the
        ``\x1b`` sequence start, and the ``m`` sequence end. In other
        words, all text before and after each ``\x1b[M;Nm`` sequence.

        If the text ends part-way through a sequence (an ``\x1b`` which is
        never followed by an ``m``), the remainder of the text is discarded
        and the generator stops.

    :Example:

        Strip the colouring sequence from terminal text and return a
        single string::

            clean = ''.join(strip_ansi_colour(text))

        Strip the colouring sequence from terminal text and return a list
        of lines, with empty lines removed::

            lines = list(filter(None, ''.join(strip_ansi_colour(text)).split('\n')))

    Yields:
        str: Each character which is not part of an ANSI escape sequence
        is yielded to the caller. Essentially, this is a generator
        method.

    """
    buff = io.StringIO(text)
    while (char := buff.read(1)):
        if char != '\x1b':
            yield char
            continue
        # Fast-forward from \x1b to m. The emptiness test stops the scan at
        # the end of the text, if the sequence is never terminated.
        while (char := buff.read(1)) and char != 'm':
            pass
def testimport(module_name: str, verbose: bool=True) -> bool:
    """Test if a Python library is installed.

    Args:
        module_name (str): Exact name of the module to be found.
        verbose (bool, optional): Notify if the library is not installed.
            Defaults to True.

    :Design:
        This is a small helper function designed to test if a library is
        installed before trying to import it.

        If the library is not installed the user is notified, if the
        ``verbose`` argument is True.

        A dotted name (e.g. ``'package.module'``) whose parent package is
        not installed is reported as not installed, rather than raising an
        error.

    :Internal Use:
        For example, the :meth:`~utils.getdrivername` function uses this
        function before attempting to import the ``pyodbc`` library.

    :Example:

        Execute a path only if ``mymodule`` is installed::

            >>> from utils4 import utils

            >>> if utils.testimport('mymodule', verbose=True):
            >>>     import mymodule
            >>>     ...
            >>> else:
            >>>     ...

    Returns:
        bool: True if the library is installed, otherwise False.

    """
    # ``import importlib`` alone does not guarantee the ``util`` submodule
    # has been loaded, so it is imported explicitly.
    import importlib.util
    try:
        found = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises if the parent of a dotted name cannot be imported,
        # or if an already-imported module has no spec.
        found = False
    if verbose and not found:
        ui.print_warning(f'\nLibrary/module not installed: {module_name}')
    return found

# The function name starts with 'test'; this flag stops pytest from collecting
# it as a test case when it is imported into a test module.
testimport.__test__ = False
def unidecode(string: str, **kwargs) -> str:
    """Attempt to convert a Unicode string object into a 7-bit ASCII string.

    Args:
        string (str): The string to be decoded.
        **kwargs (dict): Keyword arguments passed directly into the underlying
            :func:`unidecode.unidecode` function.

    :Design:
        This function is a light wrapper around the :func:`unidecode.unidecode`
        function.

        **Per the** ``unidecode`` **docstring:**

            "Transliterate an Unicode object into an ASCII string."

            Example::

                >>> unidecode(u"北亰")
                "Bei Jing "

            "This function first tries to convert the string using ASCII codec.
            If it fails (because of non-ASCII characters), it falls back to
            transliteration using the character tables."

            "This is approx. five times faster if the string only contains ASCII
            characters, but slightly slower than
            :func:`unidecode.unidecode_expect_nonascii` if non-ASCII characters are
            present."

    :Dependencies:

        - ``unidecode`` library

    :Example:

        Convert a Polish address into pure ASCII::

            >>> from utils4 import utils

            >>> addr = 'ul. Bałtów 8a 27-423 Bałtów, woj. świętokrzyskie'
            >>> utils.unidecode(addr)
            'ul. Baltow 8a 27-423 Baltow, woj. swietokrzyskie'

        Convert the first line of 'The Seventh Letter', by Plato::

            >>> text = 'Πλάτων τοῖς Δίωνος οἰκείοις τε καὶ ἑταίροις εὖ πράττειν.'
            >>> utils.unidecode(text)
            'Platon tois Dionos oikeiois te kai etairois eu prattein.'

    Returns:
        str: If the ``unidecode`` library is installed and the passed
        ``string`` value is a ``str`` data type, the decoded string is
        returned, otherwise the original value is returned.

    """
    # pylint: disable=redefined-outer-name  # No adverse effects and keeps clear variable name.
    if not testimport(module_name='unidecode', verbose=True):  # pragma: nocover
        return string
    import unidecode as unidecode_
    if isinstance(string, str):
        return unidecode_.unidecode(string, **kwargs)
    # Non-string values are handed back untouched.
    return string