Fix CVE-2024-5206

2024-06-17 14:17:54 +08:00 · 2024-06-17 14:17:54 +08:00 · ff9a954fbf
commit ff9a954fbf
parent c43d3ab394
2 changed files with 210 additions and 3 deletions
--- a/backport-CVE-2024-5206.patch
+++ b/backport-CVE-2024-5206.patch
@@ -0,0 +1,203 @@
+From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
+From: Olivier Grisel <olivier.grisel@ensta.org>
+Date: Mon, 22 Apr 2024 15:10:46 +0200
+Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
+ vectorizer (#28823)
+
+Origin:
+https://github.com/scikit-learn/scikit-learn/commit/70ca21f106b603b611da73012c9ade7cd8e438b8
+---
+ sklearn/feature_extraction/tests/test_text.py | 35 ----------------
+ sklearn/feature_extraction/text.py            | 40 +++----------------
+ 2 files changed, 5 insertions(+), 70 deletions(-)
+
+diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
+index 79c5486..9c4a017 100644
+--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
+@@ -584,14 +584,11 @@ def test_feature_names():
+ @pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
+ def test_vectorizer_max_features(Vectorizer):
+     expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
+-    expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
+-                               u'sparkling', u'water', u'the'])
+ 
+     # test bounded number of extracted features
+     vectorizer = Vectorizer(max_df=0.6, max_features=4)
+     vectorizer.fit(ALL_FOOD_DOCS)
+     assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
+-    assert_equal(vectorizer.stop_words_, expected_stop_words)
+ 
+ 
+ def test_count_vectorizer_max_features():
+@@ -626,21 +623,16 @@ def test_vectorizer_max_df():
+     vect.fit(test_data)
+     assert 'a' in vect.vocabulary_.keys()
+     assert_equal(len(vect.vocabulary_.keys()), 6)
+-    assert_equal(len(vect.stop_words_), 0)
+ 
+     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
+     vect.fit(test_data)
+     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 4)    # {bcdt} remain
+-    assert 'a' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 2)
+ 
+     vect.max_df = 1
+     vect.fit(test_data)
+     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 4)    # {bcdt} remain
+-    assert 'a' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 2)
+ 
+ 
+ def test_vectorizer_min_df():
+@@ -649,21 +641,16 @@ def test_vectorizer_min_df():
+     vect.fit(test_data)
+     assert 'a' in vect.vocabulary_.keys()
+     assert_equal(len(vect.vocabulary_.keys()), 6)
+-    assert_equal(len(vect.stop_words_), 0)
+ 
+     vect.min_df = 2
+     vect.fit(test_data)
+     assert 'c' not in vect.vocabulary_.keys()  # {bcdt} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 2)    # {ae} remain
+-    assert 'c' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 4)
+ 
+     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
+     vect.fit(test_data)
+     assert 'c' not in vect.vocabulary_.keys()  # {bcdet} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 1)    # {a} remains
+-    assert 'c' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 5)
+ 
+ 
+ def test_count_binary_occurrences():
+@@ -936,28 +923,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
+         assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
+ 
+ 
+-def test_stop_words_removal():
+-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
+-
+-    fitted_vectorizers = (
+-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
+-    )
+-
+-    for vect in fitted_vectorizers:
+-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        vect.stop_words_ = None
+-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        delattr(vect, 'stop_words_')
+-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        assert_array_equal(stop_None_transform, vect_transform)
+-        assert_array_equal(stop_del_transform, vect_transform)
+-
+-
+ def test_pickling_transformer():
+     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
+     orig = TfidfTransformer().fit(X)
+diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
+index 19d5c7f..bd9da0b 100644
+--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
+@@ -795,15 +795,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+     vocabulary_ : dict
+         A mapping of terms to feature indices.
+ 
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import CountVectorizer
+@@ -827,11 +818,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+     --------
+     HashingVectorizer, TfidfVectorizer
+ 
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+     """
+ 
+     def __init__(self, input='content', encoding='utf-8',
+@@ -909,18 +895,16 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+             mask = new_mask
+ 
+         new_indices = np.cumsum(mask) - 1  # maps old indices to new
+-        removed_terms = set()
+         for term, old_index in list(six.iteritems(vocabulary)):
+             if mask[old_index]:
+                 vocabulary[term] = new_indices[old_index]
+             else:
+                 del vocabulary[term]
+-                removed_terms.add(term)
+         kept_indices = np.where(mask)[0]
+         if len(kept_indices) == 0:
+             raise ValueError("After pruning, no terms remain. Try a lower"
+                              " min_df or a higher max_df.")
+-        return X[:, kept_indices], removed_terms
++        return X[:, kept_indices]
+ 
+     def _count_vocab(self, raw_documents, fixed_vocab):
+         """Create sparse feature matrix, and vocabulary where fixed_vocab=False
+@@ -1046,10 +1030,10 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+             if max_doc_count < min_doc_count:
+                 raise ValueError(
+                     "max_df corresponds to < documents than min_df")
+-            X, self.stop_words_ = self._limit_features(X, vocabulary,
+-                                                       max_doc_count,
+-                                                       min_doc_count,
+-                                                       max_features)
++            X = self._limit_features(X, vocabulary,
++                                    max_doc_count,
++                                    min_doc_count,
++                                    max_features)
+ 
+             self.vocabulary_ = vocabulary
+ 
+@@ -1459,15 +1443,6 @@ class TfidfVectorizer(CountVectorizer):
+         The inverse document frequency (IDF) vector; only defined
+         if  ``use_idf`` is True.
+ 
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import TfidfVectorizer
+@@ -1491,11 +1466,6 @@ class TfidfVectorizer(CountVectorizer):
+     TfidfTransformer : Performs the TF-IDF transformation from a provided
+         matrix of counts.
+ 
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+     """
+ 
+     def __init__(self, input='content', encoding='utf-8',
+-- 
+2.33.0
+
--- a/python-scikit-learn.spec
+++ b/python-scikit-learn.spec
@@ -3,10 +3,11 @@
 Name:           python-scikit-learn
 Summary:        A Python module for machine learning built on top of SciPy
 Version:        0.20.4
-Release:        4
+Release:        5
 License:        BSD
 URL:            https://scikit-learn.org/stable/
 Source0:        https://github.com/scikit-learn/scikit-learn/archive/%{version}/scikit-learn-%{version}.tar.gz
+Patch3000:      backport-CVE-2024-5206.patch

 %global _description\
 scikit-learn is a Python module for machine learning built on top of SciPy\
@@ -19,14 +20,14 @@ Summary:        %summary
 %{?python_provide:%python_provide python3-scikit-learn}
 %{?python_provide:%python_provide python3-sklearn}

-BuildRequires:  git python3-devel python3-numpy python3-Cython python3-pytest
+BuildRequires:  python3-devel python3-numpy python3-Cython python3-pytest
 Requires:       python3 >= 3.5 python3-numpy >= 1.11.0
 Requires:       python3-scipy >= 0.17.0 python3-joblib >=  0.11

 %description -n python3-scikit-learn %_description

 %prep
-%autosetup -n scikit-learn-%{version} -p1 -Sgit
+%autosetup -n scikit-learn-%{version} -p1

 %build
 %py3_build
@@ -41,6 +42,9 @@ Requires:       python3-scipy >= 0.17.0 python3-joblib >=  0.11
 %{python3_sitearch}/scikit_learn-*.egg-info

 %changelog
+* Mon Jun 17 2024 yaoxin <yao_xin001@hoperun.com> - 0.20.4-5
+- Fix CVE-2024-5206
+
 * Mon Sep 27 2021 lingsheng <lingsheng@huawei.com> - 0.20.4-4
 - Provides python-scikit-learn and python-sklearn for compatibility