From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Mon, 22 Apr 2024 15:10:46 +0200
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text vectorizer (#28823)

Origin: https://github.com/scikit-learn/scikit-learn/commit/70ca21f106b603b611da73012c9ade7cd8e438b8
---
 sklearn/feature_extraction/tests/test_text.py | 35 ----------------
 sklearn/feature_extraction/text.py            | 40 +++----------------
 2 files changed, 5 insertions(+), 70 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 79c5486..9c4a017 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -584,14 +584,11 @@ def test_feature_names():
 @pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
 def test_vectorizer_max_features(Vectorizer):
     expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
-    expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
-                               u'sparkling', u'water', u'the'])
 
     # test bounded number of extracted features
     vectorizer = Vectorizer(max_df=0.6, max_features=4)
     vectorizer.fit(ALL_FOOD_DOCS)
     assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
-    assert_equal(vectorizer.stop_words_, expected_stop_words)
 
 
 def test_count_vectorizer_max_features():
@@ -626,21 +623,16 @@ def test_vectorizer_max_df():
     vect.fit(test_data)
     assert 'a' in vect.vocabulary_.keys()
     assert_equal(len(vect.vocabulary_.keys()), 6)
-    assert_equal(len(vect.stop_words_), 0)
 
     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
     vect.fit(test_data)
     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
     assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
-    assert 'a' in vect.stop_words_
-    assert_equal(len(vect.stop_words_), 2)
 
     vect.max_df = 1
     vect.fit(test_data)
     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
     assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
-    assert 'a' in vect.stop_words_
-    assert_equal(len(vect.stop_words_), 2)
 
 
 def test_vectorizer_min_df():
@@ -649,21 +641,16 @@ def test_vectorizer_min_df():
     vect.fit(test_data)
     assert 'a' in vect.vocabulary_.keys()
     assert_equal(len(vect.vocabulary_.keys()), 6)
-    assert_equal(len(vect.stop_words_), 0)
 
     vect.min_df = 2
     vect.fit(test_data)
     assert 'c' not in vect.vocabulary_.keys()  # {bcdt} ignored
     assert_equal(len(vect.vocabulary_.keys()), 2)  # {ae} remain
-    assert 'c' in vect.stop_words_
-    assert_equal(len(vect.stop_words_), 4)
 
     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
     vect.fit(test_data)
     assert 'c' not in vect.vocabulary_.keys()  # {bcdet} ignored
     assert_equal(len(vect.vocabulary_.keys()), 1)  # {a} remains
-    assert 'c' in vect.stop_words_
-    assert_equal(len(vect.stop_words_), 5)
 
 
 def test_count_binary_occurrences():
@@ -936,28 +923,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
     assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
 
 
-def test_stop_words_removal():
-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
-
-    fitted_vectorizers = (
-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
-    )
-
-    for vect in fitted_vectorizers:
-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        vect.stop_words_ = None
-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        delattr(vect, 'stop_words_')
-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        assert_array_equal(stop_None_transform, vect_transform)
-        assert_array_equal(stop_del_transform, vect_transform)
-
-
 def test_pickling_transformer():
     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
     orig = TfidfTransformer().fit(X)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 19d5c7f..bd9da0b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -795,15 +795,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     vocabulary_ : dict
         A mapping of terms to feature indices.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import CountVectorizer
@@ -827,11 +818,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     --------
     HashingVectorizer, TfidfVectorizer
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',
@@ -909,18 +895,16 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
             mask = new_mask
 
         new_indices = np.cumsum(mask) - 1  # maps old indices to new
-        removed_terms = set()
         for term, old_index in list(six.iteritems(vocabulary)):
             if mask[old_index]:
                 vocabulary[term] = new_indices[old_index]
             else:
                 del vocabulary[term]
-                removed_terms.add(term)
         kept_indices = np.where(mask)[0]
         if len(kept_indices) == 0:
             raise ValueError("After pruning, no terms remain. Try a lower"
                              " min_df or a higher max_df.")
-        return X[:, kept_indices], removed_terms
+        return X[:, kept_indices]
 
     def _count_vocab(self, raw_documents, fixed_vocab):
         """Create sparse feature matrix, and vocabulary where fixed_vocab=False
@@ -1046,10 +1030,10 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
             if max_doc_count < min_doc_count:
                 raise ValueError(
                     "max_df corresponds to < documents than min_df")
-            X, self.stop_words_ = self._limit_features(X, vocabulary,
-                                                       max_doc_count,
-                                                       min_doc_count,
-                                                       max_features)
+            X = self._limit_features(X, vocabulary,
+                                     max_doc_count,
+                                     min_doc_count,
+                                     max_features)
 
             self.vocabulary_ = vocabulary
 
@@ -1459,15 +1443,6 @@ class TfidfVectorizer(CountVectorizer):
         The inverse document frequency (IDF) vector; only defined
         if ``use_idf`` is True.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
@@ -1491,11 +1466,6 @@ class TfidfVectorizer(CountVectorizer):
     TfidfTransformer : Performs the TF-IDF transformation from a provided
        matrix of counts.
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',
-- 
2.33.0
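
A minimal sketch of the user-facing effect, assuming a scikit-learn build with this patch applied: after fitting, the vectorizer no longer stores a stop_words_ set, so any terms pruned by max_df/min_df/max_features can, if still needed, be recomputed by comparing the analyzer output with the fitted vocabulary_. The sample documents and the recomputation below are illustrative assumptions, not part of the patch.

    from sklearn.feature_extraction.text import CountVectorizer

    docs = [
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
    ]

    # With this change, the fitted vectorizer keeps only vocabulary_; the
    # pruned terms are no longer recorded on the estimator.
    vect = CountVectorizer(max_df=0.6, max_features=4).fit(docs)

    # Recompute the pruned terms from the analyzer output (illustrative only).
    analyze = vect.build_analyzer()
    all_terms = set(term for doc in docs for term in analyze(doc))
    pruned_terms = all_terms - set(vect.vocabulary_)
    print(sorted(pruned_terms))  # terms dropped by max_df / max_features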