# Source code for skll.data.featureset

# License: BSD 3 clause
"""
Classes related to storing/merging feature sets.

:author: Dan Blanchard (dblanchard@ets.org)
:organization: ETS
"""

from __future__ import absolute_import, print_function, unicode_literals

from copy import deepcopy

import numpy as np
import scipy.sparse as sp
from six import iteritems
from six.moves import zip
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

from skll.data.dict_vectorizer import DictVectorizer as NewDictVectorizer


class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    .. warning::
       FeatureSets can only be equal if the order of the instances is
       identical because these are stored as lists/arrays.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param labels: labels for this set.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    :raises ValueError: if `features` is given and `ids` is None, or if the
                        number of IDs or labels differs from the number of
                        feature rows.

    .. note::
       The row-count checks are only performed when `features` is not None.
       In that case, missing labels are replaced by an array of NaN with one
       entry per feature row.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to a feature matrix
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                # Unlabeled data is marked with NaN; ``has_labels`` relies on
                # this to detect the absence of real labels.
                self.labels = np.empty(num_feats)
                self.labels.fill(np.nan)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    @staticmethod
    def _arrays_match(first, second):
        """
        :returns: True if both arrays are None, or if both have the same shape
                  and all of their elements compare equal.
        """
        if first is None or second is None:
            return first is None and second is None
        return first.shape == second.shape and bool((first == second).all())

    @staticmethod
    def _isin(values, allowed):
        """
        :returns: a boolean array telling, for each element of ``values``,
                  whether it occurs in ``allowed``.
        """
        # ``np.in1d`` is deprecated in NumPy 2.x in favour of ``np.isin``,
        # which in turn is missing from very old NumPy releases.
        membership = getattr(np, 'isin', None) or np.in1d
        return membership(values, allowed)

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        IDs, labels, feature matrices and vectorizers are compared; the name
        is not.  Objects that are not FeatureSets are never equal.

        .. note::
           Feature values are compared with ``np.allclose`` using a relative
           tolerance of 1e-6.

        .. note::
           NaN never compares equal to NaN, so sets whose labels contain NaN
           (e.g. the placeholder labels of unlabeled data) are never equal.
        """
        if not isinstance(other, FeatureSet):
            return NotImplemented

        if not (self._arrays_match(self.ids, other.ids) and
                self._arrays_match(self.labels, other.labels)):
            return False

        # Sets without features can only equal other sets without features.
        if self.features is None or other.features is None:
            return bool(self.features is None and
                        other.features is None and
                        self.vectorizer == other.vectorizer)

        if self.features.shape != other.features.shape:
            return False

        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        # Matrices with a different number of stored entries cannot be equal,
        # and handing them to np.allclose would raise a broadcasting error.
        if self.features.data.shape != other.features.data.shape:
            return False

        return bool(np.allclose(self.features.data,
                                other.features.data,
                                rtol=1e-6) and
                    (self.features.indices == other.features.indices).all() and
                    (self.features.indptr == other.features.indptr).all() and
                    self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.

        Nothing is yielded if the set has no features.

        :raises ValueError: if the set has features but its vectorizer is not
                            a DictVectorizer.
        """
        if self.features is None:
            return
        if not isinstance(self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # When calling inverse_transform we have to add [0] to get the
            # results for the current instance because it always returns a
            # 2D array
            yield (id_, label_,
                   self.vectorizer.inverse_transform(feats)[0])

    def __len__(self):
        """
        :returns: the number of feature rows or, if the set has no features,
                  the number of IDs (0 if there are no IDs either).
        """
        if self.features is not None:
            return self.features.shape[0]
        return 0 if self.ids is None else len(self.ids)

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  Both sets must contain
        the same IDs; the rows of ``other`` are re-ordered to follow the ID
        order of ``self`` before the feature columns are concatenated.

        :raises ValueError: if the sets of IDs differ, if the vectorizers are
                            of different types or incompatible, if feature
                            names are duplicated, or if labels conflict.
        """
        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = {id_: index for index, id_ in enumerate(other.ids)}
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if self.vectorizer.n_features != other.vectorizer.n_features:
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            # Columns of ``other`` are appended after those of ``self``, so
            # their vocabulary indices are shifted accordingly.
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the Featureset
        depending on the passed in parameters.  The FeatureSet is modified in
        place.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: if `features` is given and the FeatureSet uses a
                            FeatureHasher.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, self._isin(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, self._isin(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        if self.labels is not None:
            self.labels = self.labels[mask]
        if self.features is not None:
            self.features = self.features[mask, :]

        # Filter features (nothing to do if there are no feature columns)
        if features is None or self.features is None:
            return
        if isinstance(self.vectorizer, FeatureHasher):
            raise ValueError('FeatureSets with FeatureHasher vectorizers'
                             ' cannot be filtered by feature.')
        # An explicit integer dtype keeps the index array valid even when
        # no feature name matches and the array is empty.
        columns = np.array(sorted({feat_num for feat_name, feat_num in
                                   iteritems(self.vectorizer.vocabulary_)
                                   if (feat_name in features or
                                       feat_name.split('=', 1)[0] in
                                       features)}),
                           dtype=int)
        if inverse:
            all_columns = np.arange(self.features.shape[1])
            columns = all_columns[np.logical_not(self._isin(all_columns,
                                                            columns))]
        self.features = self.features[:, columns]
        self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.  Nothing is yielded if the set has
        no features.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None` and `inverse` is true, feature dictionaries
                         are yielded unchanged; if `None` and `inverse` is
                         false, empty feature dictionaries are yielded.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: if the set has features but its vectorizer is not
                            a DictVectorizer.
        """
        # Mirror ``__iter__``: a set without features has nothing to yield.
        if self.features is None:
            return
        if not isinstance(self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                # NOTE(review): with no feature list and inverse=False every
                # instance is yielded with an empty feature dict, unlike
                # ``filter`` which leaves features untouched -- confirm that
                # this is intended before relying on it.
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has any finite labels, i.e.
                  ``False`` if the labels are None or are floating point
                  values containing NaN, ``True`` otherwise.
        """
        if self.labels is None:
            return False
        # Only floating point arrays can hold the NaN placeholder.
        if not np.issubdtype(self.labels.dtype, np.floating):
            return True
        return not np.isnan(self.labels).any()

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number as an
                  ``(id, label, feature_dict)`` tuple, or if given a slice,
                  a new FeatureSet containing a subset of the data.  The label
                  is None when the set has no labels and the feature
                  dictionary is empty when the set has no features.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)

        label = self.labels[value] if self.labels is not None else None
        # Only touch the feature matrix when there is one.
        if self.features is not None:
            feats = self.features[value, :]
            features = self.vectorizer.inverse_transform(feats)[0]
        else:
            features = {}
        return self.ids[value], label, features

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a FeatureSet object from a
        `pandas.DataFrame`.  `FeatureSet` `ids` will be the index on `df`.

        :param df: The pandas.DataFrame object you'd like to use as a feature
                   set.
        :type df: pandas.DataFrame
        :param name: The name of this feature set.
        :type name: str
        :param labels_column: The name of the column containing the labels
                              (data to predict).  If not given, all columns
                              are used as features and the set is unlabeled.
        :type labels_column: str or None
        :param vectorizer: Vectorizer that created feature matrix.
        :type vectorizer: DictVectorizer or FeatureHasher

        :returns: a new FeatureSet with one instance per row of `df`.
        """
        if labels_column:
            feature_columns = [column for column in df.columns
                               if column != labels_column]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        # One {column name: value} dictionary per row
        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)