Source code for skll.data.writers

# License: BSD 3 clause
"""
Handles writing data to various types of data files.

:author: Dan Blanchard (dblanchard@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""

from __future__ import absolute_import, print_function, unicode_literals

import json
import logging
import os
import re
import sys
from csv import DictWriter
from decimal import Decimal
from io import open

import numpy as np
from six import iteritems, PY2, string_types, text_type
from six.moves import map
from sklearn.feature_extraction import FeatureHasher


class Writer(object):

    """
    Helper class for writing out FeatureSets to files.

    :param path: A path to the feature file we would like to create.
                 The suffix to this filename must be ``.arff``, ``.csv``,
                 ``.jsonlines``, ``.libsvm``, ``.megam``, ``.ndj``, or
                 ``.tsv``. If ``subsets`` is not ``None``, when calling the
                 ``write()`` method, path is assumed to be a string
                 containing the path to the directory to write the feature
                 files with an additional file extension specifying the file
                 type. For example ``/foo/.csv``.
    :type path: str
    :param feature_set: The FeatureSet to dump to a file.
    :type feature_set: FeatureSet
    :param quiet: Do not print "Writing..." status message to stderr.
    :type quiet: bool
    :param requires_binary: Whether or not the Writer must open the file in
                            binary mode for writing with Python 2.
    :type requires_binary: bool
    :param subsets: A mapping from subset names to lists of feature names
                    that are included in those sets. If given, a feature
                    file will be written for every subset (with the name
                    containing the subset name as suffix to ``path``).
                    Note, since string-valued features are automatically
                    converted into boolean features with names of the form
                    ``FEATURE_NAME=STRING_VALUE``, when doing the filtering,
                    the portion before the ``=`` is all that's used for
                    matching. Therefore, you do not need to enumerate all of
                    these boolean feature names in your mapping.
    :type subsets: dict (str to list of str)
    """

    def __init__(self, path, feature_set, **kwargs):
        super(Writer, self).__init__()
        self.requires_binary = kwargs.pop('requires_binary', False)
        self.quiet = kwargs.pop('quiet', True)
        self.path = path
        self.feat_set = feature_set
        self.subsets = kwargs.pop('subsets', None)
        # Get prefix & extension for checking file types & writing subset
        # files
        # TODO: Determine if we purposefully used this instead of
        # os.path.split
        self.root, self.ext = re.search(r'^(.*)(\.[^.]*)$', path).groups()
        self._progress_msg = ''
        if kwargs:
            raise ValueError('Passed extra keyword arguments to '
                             'Writer constructor: {}'.format(kwargs))

    @classmethod
    def for_path(cls, path, feature_set, **kwargs):
        """
        :param path: A path to the feature file we would like to create. The
                     suffix to this filename must be ``.arff``, ``.csv``,
                     ``.jsonlines``, ``.libsvm``, ``.megam``, ``.ndj``, or
                     ``.tsv``. If ``subsets`` is not ``None``, when calling
                     the ``write()`` method, path is assumed to be a string
                     containing the path to the directory to write the
                     feature files with an additional file extension
                     specifying the file type. For example ``/foo/.csv``.
        :type path: str
        :param feature_set: The FeatureSet to dump to a file.
        :type feature_set: FeatureSet
        :param kwargs: The keyword arguments for ``for_path`` are the same
                       as the initializer for the desired ``Writer``
                       subclass.
        :type kwargs: dict

        :returns: New instance of the Writer sub-class that is appropriate
                  for the given path.
        """
        # Get lowercase extension for file extension checking
        ext = '.' + path.rsplit('.', 1)[-1].lower()
        return EXT_TO_WRITER[ext](path, feature_set, **kwargs)
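
    # ------------------------------------------------------------------
    # Usage sketch (illustrative; not part of the original module).  The
    # ``FeatureSet`` constructor call below is an assumption about the
    # skll.data.FeatureSet signature and may need adjusting for your
    # version:
    #
    #     from skll.data import FeatureSet
    #     fs = FeatureSet('example', ids=['EXAMPLE_0', 'EXAMPLE_1'],
    #                     labels=['cat', 'dog'],
    #                     features=[{'f1': 1.0}, {'f1': 2.0}])
    #     Writer.for_path('/tmp/example.csv', fs).write()  # -> CSVWriter
    # ------------------------------------------------------------------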

    def write(self):
        """
        Writes out this Writer's FeatureSet to a file in its format.
        """
        # Setup logger
        logger = logging.getLogger(__name__)

        if isinstance(self.feat_set.vectorizer, FeatureHasher):
            raise ValueError('Writer cannot write sets that use a '
                             'FeatureHasher for vectorization.')

        # Write one feature file if we weren't given a dict of subsets
        if self.subsets is None:
            self._write_subset(self.path, None)
        # Otherwise write one feature file per subset
        else:
            for subset_name, filter_features in iteritems(self.subsets):
                logger.debug('Subset (%s) features: %s', subset_name,
                             filter_features)
                sub_path = os.path.join(self.root,
                                        '{}{}'.format(subset_name, self.ext))
                self._write_subset(sub_path, set(filter_features))
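
    # Illustrative sketch (an assumption, not from the original source):
    # with ``subsets``, ``write()`` produces one file per subset next to
    # ``self.root``.  Assuming ``fs`` has features named 'word' and 'tag':
    #
    #     subsets = {'words': ['word'], 'tags': ['tag']}
    #     Writer.for_path('/foo/.csv', fs, subsets=subsets).write()
    #     # -> /foo/words.csv and /foo/tags.csv, each containing only the
    #     #    matching features (string-valued features match on the part
    #     #    of the name before '=')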

    def _write_subset(self, sub_path, filter_features):
        """
        Writes out the given FeatureSet to a file in this class's format.

        :param sub_path: The path to the file we want to create for this
                         subset of our data.
        :type sub_path: str
        :param filter_features: Set of features to include in current
                                feature file.
        :type filter_features: set of str
        """
        # Setup logger
        logger = logging.getLogger(__name__)

        logger.debug('sub_path: %s', sub_path)
        logger.debug('feature_set: %s', self.feat_set.name)
        logger.debug('filter_features: %s', filter_features)

        if not self.quiet:
            self._progress_msg = "Writing {}...".format(sub_path)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Apply filtering
        filtered_set = (self.feat_set.filtered_iter(features=filter_features)
                        if filter_features is not None else self.feat_set)

        # Open file for writing and write each line
        file_mode = 'wb' if (self.requires_binary and PY2) else 'w'
        with open(sub_path, file_mode) as output_file:
            # Write out the header if this format requires it
            self._write_header(filtered_set, output_file, filter_features)
            # Write individual lines
            for ex_num, (id_, label_, feat_dict) in enumerate(filtered_set):
                self._write_line(id_, label_, feat_dict, output_file)
                if not self.quiet and ex_num % 100 == 0:
                    print("{}{:>15}".format(self._progress_msg, ex_num),
                          end="\r", file=sys.stderr)
                    sys.stderr.flush()
            if not self.quiet:
                print("{}{:<15}".format(self._progress_msg, "done"),
                      file=sys.stderr)
                sys.stderr.flush()

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can be
        written for files that need them.

        :param feature_set: The FeatureSet being written to a file.
        :type feature_set: FeatureSet
        :param output_file: The file being written to.
        :type output_file: file
        :param filter_features: If only writing a subset of the features in
                                the FeatureSet to ``output_file``, these are
                                the features to include in this file.
        :type filter_features: set of str
        """
        pass

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in this Writer's format.

        :param id_: The ID for the current instance.
        :type id_: str
        :param label_: The label for the current instance.
        :type label_: str
        :param feat_dict: The feature dictionary for the current instance.
        :type feat_dict: dict
        :param output_file: The file being written to.
        :type output_file: file
        """
        raise NotImplementedError


class DelimitedFileWriter(Writer):

    """
    Writer for writing out FeatureSets as TSV/CSV files.

    :param path: A path to the feature file we would like to create. If
                 ``subsets`` is not ``None``, this is assumed to be a string
                 containing the path to the directory to write the feature
                 files with an additional file extension specifying the file
                 type. For example ``/foo/.csv``.
    :type path: str
    :param feature_set: The FeatureSet to dump to a file.
    :type feature_set: FeatureSet
    :param quiet: Do not print "Writing..." status message to stderr.
    :type quiet: bool
    :param id_col: Name of the column to store the instance IDs in for ARFF,
                   CSV, and TSV files.
    :type id_col: str
    :param label_col: Name of the column which contains the class labels for
                      CSV/TSV files.
    :type label_col: str
    :param dialect: The dialect to use for the underlying ``csv.DictWriter``.
                    Default: ``'excel-tab'``
    :type dialect: str
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['requires_binary'] = True
        self.dialect = kwargs.pop('dialect', 'excel-tab')
        self.label_col = kwargs.pop('label_col', 'y')
        self.id_col = kwargs.pop('id_col', 'id')
        super(DelimitedFileWriter, self).__init__(path, feature_set,
                                                  **kwargs)
        self._dict_writer = None

    def _get_fieldnames(self, filter_features):
        """
        Build list of fieldnames for the DictWriter from ``self.feat_set``.

        :param filter_features: Set of features to include in current
                                feature file.
        :type filter_features: set of str
        """
        # Build list of fieldnames (features + id_col + label_col)
        if filter_features is not None:
            fieldnames = {feat_name for feat_name in
                          self.feat_set.vectorizer.get_feature_names()
                          if (feat_name in filter_features or
                              feat_name.split('=', 1)[0] in filter_features)}
        else:
            fieldnames = set(self.feat_set.vectorizer.get_feature_names())
        fieldnames.add(self.id_col)
        if self.feat_set.has_labels:
            fieldnames.add(self.label_col)
        return sorted(fieldnames)

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can be
        written for files that need them.

        :param feature_set: The FeatureSet being written to a file.
        :type feature_set: FeatureSet
        :param output_file: The file being written to.
        :type output_file: file
        :param filter_features: If only writing a subset of the features in
                                the FeatureSet to ``output_file``, these are
                                the features to include in this file.
        :type filter_features: set of str
        """
        # Initialize DictWriter that will be used to write header and rows
        self._dict_writer = DictWriter(output_file,
                                       self._get_fieldnames(filter_features),
                                       restval=0, dialect=self.dialect)
        # Actually output the header to the file
        self._dict_writer.writeheader()

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in this Writer's format.

        :param id_: The ID for the current instance.
        :type id_: str
        :param label_: The label for the current instance.
        :type label_: str
        :param feat_dict: The feature dictionary for the current instance.
        :type feat_dict: dict
        :param output_file: The file being written to.
        :type output_file: file
        """
        # Add class column to feat_dict (unless this is unlabelled data)
        if self.label_col not in feat_dict:
            if self.feat_set.has_labels:
                feat_dict[self.label_col] = label_
        else:
            raise ValueError(('Class column name "{}" already used as '
                              'feature name.').format(self.label_col))
        # Add id column to feat_dict if id is provided
        if self.id_col not in feat_dict:
            feat_dict[self.id_col] = id_
        else:
            raise ValueError('ID column name "{}" already used as feature '
                             'name.'.format(self.id_col))
        # Write out line
        self._dict_writer.writerow(feat_dict)
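
# Illustrative example (derived from the code above, not from the original
# source): with the default ``id_col``/``label_col``, a labelled instance
# ``('EXAMPLE_0', 'cat', {'f1': 1.0, 'f2': 0.5})`` written by ``CSVWriter``
# yields a header plus row like:
#
#     f1,f2,id,y
#     1.0,0.5,EXAMPLE_0,cat
#
# Fieldnames are sorted, and ``restval=0`` fills in any missing features.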


class CSVWriter(DelimitedFileWriter):

    """
    Writer for writing out FeatureSets as CSV files.

    :param path: A path to the feature file we would like to create. If
                 ``subsets`` is not ``None``, this is assumed to be a string
                 containing the path to the directory to write the feature
                 files with an additional file extension specifying the file
                 type. For example ``/foo/.csv``.
    :type path: str
    :param feature_set: The FeatureSet to dump to a file.
    :type feature_set: FeatureSet
    :param quiet: Do not print "Writing..." status message to stderr.
    :type quiet: bool
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['dialect'] = 'excel'
        super(CSVWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None


class TSVWriter(DelimitedFileWriter):

    """
    Writer for writing out FeatureSets as TSV files.

    :param path: A path to the feature file we would like to create. If
                 ``subsets`` is not ``None``, this is assumed to be a string
                 containing the path to the directory to write the feature
                 files with an additional file extension specifying the file
                 type. For example ``/foo/.tsv``.
    :type path: str
    :param feature_set: The FeatureSet to dump to a file.
    :type feature_set: FeatureSet
    :param quiet: Do not print "Writing..." status message to stderr.
    :type quiet: bool
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['dialect'] = 'excel-tab'
        super(TSVWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None


class ARFFWriter(DelimitedFileWriter):

    """
    Writer for writing out FeatureSets as ARFF files.

    :param path: A path to the feature file we would like to create. If
                 ``subsets`` is not ``None``, this is assumed to be a string
                 containing the path to the directory to write the feature
                 files with an additional file extension specifying the file
                 type. For example ``/foo/.arff``.
    :type path: str
    :param feature_set: The FeatureSet to dump to a file.
    :type feature_set: FeatureSet
    :param requires_binary: Whether or not the Writer must open the file in
                            binary mode for writing with Python 2.
    :type requires_binary: bool
    :param quiet: Do not print "Writing..." status message to stderr.
    :type quiet: bool
    :param relation: The name of the relation in the ARFF file.
                     Default: ``'skll_relation'``
    :type relation: str
    :param regression: Is this an ARFF file to be used for regression?
                       Default: ``False``
    :type regression: bool
    """

    def __init__(self, path, feature_set, **kwargs):
        self.relation = kwargs.pop('relation', 'skll_relation')
        self.regression = kwargs.pop('regression', False)
        kwargs['dialect'] = 'arff'
        super(ARFFWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can be
        written for files that need them.

        :param feature_set: The FeatureSet being written to a file.
        :type feature_set: FeatureSet
        :param output_file: The file being written to.
        :type output_file: file
        :param filter_features: If only writing a subset of the features in
                                the FeatureSet to ``output_file``, these are
                                the features to include in this file.
        :type filter_features: set of str
        """
        fieldnames = self._get_fieldnames(filter_features)
        if self.label_col in fieldnames:
            fieldnames.remove(self.label_col)

        # Add relation to header
        print("@relation '{}'\n".format(self.relation), file=output_file)

        # Loop through fields writing the header info for the ARFF file
        for field in fieldnames:
            print("@attribute '{}' numeric".format(field.replace('\\',
                                                                 '\\\\')
                                                   .replace("'", "\\'")),
                  file=output_file)

        # Print class label header if necessary
        if self.regression:
            print("@attribute {} numeric".format(self.label_col),
                  file=output_file)
        else:
            if self.feat_set.has_labels:
                print("@attribute {} ".format(self.label_col) + "{" +
                      ','.join(map(str,
                                   sorted(set(self.feat_set.labels)))) +
                      "}", file=output_file)
        fieldnames.append(self.label_col)

        # Create CSV writer to handle missing values for lines in data
        # section and to ignore the instance values for non-numeric
        # attributes
        self._dict_writer = DictWriter(output_file, fieldnames, restval=0,
                                       extrasaction='ignore', dialect='arff')

        # Finish header and start data section
        print("\n@data", file=output_file)
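
# Illustrative example (derived from ``_write_header`` above, not from the
# original source): for a classification FeatureSet with features ``f1`` and
# ``f2`` and labels ``cat``/``dog``, the generated header section looks
# like:
#
#     @relation 'skll_relation'
#
#     @attribute 'f1' numeric
#     @attribute 'f2' numeric
#     @attribute 'id' numeric
#     @attribute y {cat,dog}
#
#     @data
#
# Note that the ``id`` column is declared numeric because every non-label
# field goes through the same ``@attribute ... numeric`` loop.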


class MegaMWriter(Writer):

    """
    Writer for writing out FeatureSets as MegaM files.
    """

    @staticmethod
    def _replace_non_ascii(line):
        """
        :param line: The line to clean up.
        :type line: str

        :returns: Copy of line with all non-ASCII characters replaced with
                  <U1234> sequences where 1234 is the value of ord() for the
                  character.
        """
        char_list = []
        for char in line:
            char_num = ord(char)
            char_list.append('<U{}>'.format(char_num) if char_num > 127
                             else char)
        return ''.join(char_list)

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in MegaM format.

        :param id_: The ID for the current instance.
        :type id_: str
        :param label_: The label for the current instance.
        :type label_: str
        :param feat_dict: The feature dictionary for the current instance.
        :type feat_dict: dict
        :param output_file: The file being written to.
        :type output_file: file
        """
        # Don't try to add class column if this is label-less data
        print('# {}'.format(id_), file=output_file)
        if self.feat_set.has_labels:
            print('{}'.format(label_), end='\t', file=output_file)
        print(self._replace_non_ascii(' '.join(('{} {}'.format(field, value)
                                                for field, value in
                                                sorted(feat_dict.items())
                                                if Decimal(value) != 0))),
              file=output_file)
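
# Illustrative example (derived from ``_write_line`` above, not from the
# original source): a labelled instance
# ``('EXAMPLE_0', 'cat', {'f1': 1.0, 'f2': 0.5})`` becomes:
#
#     # EXAMPLE_0
#     cat<TAB>f1 1.0 f2 0.5
#
# where ``<TAB>`` marks the tab separator. Zero-valued features are skipped
# and non-ASCII characters are escaped as <U1234> sequences.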


class NDJWriter(Writer):

    """
    Writer for writing out FeatureSets as .jsonlines/.ndj files.
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['requires_binary'] = True
        super(NDJWriter, self).__init__(path, feature_set, **kwargs)

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in .jsonlines/.ndj format.

        :param id_: The ID for the current instance.
        :type id_: str
        :param label_: The label for the current instance.
        :type label_: str
        :param feat_dict: The feature dictionary for the current instance.
        :type feat_dict: dict
        :param output_file: The file being written to.
        :type output_file: file
        """
        example_dict = {}
        # Don't try to add class column if this is label-less data
        if self.feat_set.has_labels:
            example_dict['y'] = np.asscalar(label_)
        example_dict['id'] = np.asscalar(id_)
        example_dict["x"] = feat_dict
        print(json.dumps(example_dict, sort_keys=True), file=output_file)
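
# Illustrative example (derived from ``_write_line`` above, not from the
# original source): each instance becomes one JSON object per line, e.g.:
#
#     {"id": "EXAMPLE_0", "x": {"f1": 1.0, "f2": 0.5}, "y": "cat"}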


class LibSVMWriter(Writer):

    """
    Writer for writing out FeatureSets as LibSVM/SVMLight files.
    """

    LIBSVM_REPLACE_DICT = {':': '\u2236',
                           '#': '\uFF03',
                           ' ': '\u2002',
                           '=': '\ua78a',
                           '|': '\u2223'}

    def __init__(self, path, feature_set, **kwargs):
        self.label_map = kwargs.pop('label_map', None)
        super(LibSVMWriter, self).__init__(path, feature_set, **kwargs)
        if self.label_map is None:
            self.label_map = {}
            if feature_set.has_labels:
                self.label_map = {label: num for num, label in
                                  enumerate(sorted({label for label in
                                                    feature_set.labels
                                                    if not isinstance(
                                                        label,
                                                        (int, float))}))}
            # Add fake item to label_map for None
            self.label_map[None] = '00000'

    @staticmethod
    def _sanitize(name):
        """
        Replace illegal characters in class names with close unicode
        equivalents to make things loadable by LibSVM, LibLinear, or
        SVMLight.
        """
        if isinstance(name, string_types):
            for orig, replacement in \
                    LibSVMWriter.LIBSVM_REPLACE_DICT.items():
                name = name.replace(orig, replacement)
        return name

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in LibSVM format.

        :param id_: The ID for the current instance.
        :type id_: str
        :param label_: The label for the current instance.
        :type label_: str
        :param feat_dict: The feature dictionary for the current instance.
        :type feat_dict: dict
        :param output_file: The file being written to.
        :type output_file: file
        """
        field_values = sorted([(self.feat_set.vectorizer.vocabulary_[field] +
                                1, value) for field, value in
                               iteritems(feat_dict) if Decimal(value) != 0])
        # Print label
        if label_ in self.label_map:
            print('{}'.format(self.label_map[label_]), end=' ',
                  file=output_file)
        else:
            print('{}'.format(label_), end=' ', file=output_file)
        # Print features
        print(' '.join(('{}:{}'.format(field, value) for field, value in
                        field_values)), end=' ', file=output_file)
        # Print comment with id and mappings
        print('#', end=' ', file=output_file)
        print(self._sanitize('{}'.format(id_)), end='', file=output_file)
        print(' |', end=' ', file=output_file)
        if (PY2 and self.feat_set.has_labels and
                isinstance(label_, text_type)):
            label_ = label_.encode('utf-8')
        if label_ in self.label_map:
            print('%s=%s' % (self._sanitize(self.label_map[label_]),
                             self._sanitize(label_)),
                  end=' | ', file=output_file)
        else:
            print(' |', end=' ', file=output_file)
        line = ' '.join(('%s=%s' % (self.feat_set.vectorizer
                                    .vocabulary_[field] + 1,
                                    self._sanitize(field))
                         for field, value in feat_dict.items()
                         if Decimal(value) != 0))
        print(line, file=output_file)
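
# Illustrative example (derived from ``_write_line`` above, not from the
# original source): with ``vectorizer.vocabulary_`` mapping ``f1`` to 0 and
# ``f2`` to 1, and ``label_map`` mapping ``cat`` to 0, the instance
# ``('EXAMPLE_0', 'cat', {'f1': 1.0, 'f2': 0.5})`` becomes:
#
#     0 1:1.0 2:0.5 # EXAMPLE_0 | 0=cat | 1=f1 2=f2
#
# The trailing comment preserves the original ID plus the label and feature
# name mappings, with illegal characters replaced via LIBSVM_REPLACE_DICT.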


# Constants
EXT_TO_WRITER = {".arff": ARFFWriter,
                 ".csv": CSVWriter,
                 ".jsonlines": NDJWriter,
                 ".libsvm": LibSVMWriter,
                 ".megam": MegaMWriter,
                 ".ndj": NDJWriter,
                 ".tsv": TSVWriter}