Fix CVE-2024-5206: backport upstream removal of the computed stop_words_ attribute from the text vectorizers
This commit is contained in:
parent
c43d3ab394
commit
ff9a954fbf
203
backport-CVE-2024-5206.patch
Normal file
203
backport-CVE-2024-5206.patch
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Olivier Grisel <olivier.grisel@ensta.org>
|
||||||
|
Date: Mon, 22 Apr 2024 15:10:46 +0200
|
||||||
|
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
|
||||||
|
vectorizer (#28823)
|
||||||
|
|
||||||
|
Origin:
|
||||||
|
https://github.com/scikit-learn/scikit-learn/commit/70ca21f106b603b611da73012c9ade7cd8e438b8
|
||||||
|
---
|
||||||
|
sklearn/feature_extraction/tests/test_text.py | 35 ----------------
|
||||||
|
sklearn/feature_extraction/text.py | 40 +++----------------
|
||||||
|
2 files changed, 5 insertions(+), 70 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
|
||||||
|
index 79c5486..9c4a017 100644
|
||||||
|
--- a/sklearn/feature_extraction/tests/test_text.py
|
||||||
|
+++ b/sklearn/feature_extraction/tests/test_text.py
|
||||||
|
@@ -584,14 +584,11 @@ def test_feature_names():
|
||||||
|
@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
|
||||||
|
def test_vectorizer_max_features(Vectorizer):
|
||||||
|
expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
|
||||||
|
- expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
|
||||||
|
- u'sparkling', u'water', u'the'])
|
||||||
|
|
||||||
|
# test bounded number of extracted features
|
||||||
|
vectorizer = Vectorizer(max_df=0.6, max_features=4)
|
||||||
|
vectorizer.fit(ALL_FOOD_DOCS)
|
||||||
|
assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
|
||||||
|
- assert_equal(vectorizer.stop_words_, expected_stop_words)
|
||||||
|
|
||||||
|
|
||||||
|
def test_count_vectorizer_max_features():
|
||||||
|
@@ -626,21 +623,16 @@ def test_vectorizer_max_df():
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'a' in vect.vocabulary_.keys()
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 6)
|
||||||
|
- assert_equal(len(vect.stop_words_), 0)
|
||||||
|
|
||||||
|
vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 4) # {bcdt} remain
|
||||||
|
- assert 'a' in vect.stop_words_
|
||||||
|
- assert_equal(len(vect.stop_words_), 2)
|
||||||
|
|
||||||
|
vect.max_df = 1
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 4) # {bcdt} remain
|
||||||
|
- assert 'a' in vect.stop_words_
|
||||||
|
- assert_equal(len(vect.stop_words_), 2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vectorizer_min_df():
|
||||||
|
@@ -649,21 +641,16 @@ def test_vectorizer_min_df():
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'a' in vect.vocabulary_.keys()
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 6)
|
||||||
|
- assert_equal(len(vect.stop_words_), 0)
|
||||||
|
|
||||||
|
vect.min_df = 2
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'c' not in vect.vocabulary_.keys() # {bcdt} ignored
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 2) # {ae} remain
|
||||||
|
- assert 'c' in vect.stop_words_
|
||||||
|
- assert_equal(len(vect.stop_words_), 4)
|
||||||
|
|
||||||
|
vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
|
||||||
|
vect.fit(test_data)
|
||||||
|
assert 'c' not in vect.vocabulary_.keys() # {bcdet} ignored
|
||||||
|
assert_equal(len(vect.vocabulary_.keys()), 1) # {a} remains
|
||||||
|
- assert 'c' in vect.stop_words_
|
||||||
|
- assert_equal(len(vect.stop_words_), 5)
|
||||||
|
|
||||||
|
|
||||||
|
def test_count_binary_occurrences():
|
||||||
|
@@ -936,28 +923,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
|
||||||
|
assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
|
||||||
|
|
||||||
|
|
||||||
|
-def test_stop_words_removal():
|
||||||
|
- # Ensure that deleting the stop_words_ attribute doesn't affect transform
|
||||||
|
-
|
||||||
|
- fitted_vectorizers = (
|
||||||
|
- TfidfVectorizer().fit(JUNK_FOOD_DOCS),
|
||||||
|
- CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
|
||||||
|
- CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
|
||||||
|
- )
|
||||||
|
-
|
||||||
|
- for vect in fitted_vectorizers:
|
||||||
|
- vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||||
|
-
|
||||||
|
- vect.stop_words_ = None
|
||||||
|
- stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||||
|
-
|
||||||
|
- delattr(vect, 'stop_words_')
|
||||||
|
- stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||||
|
-
|
||||||
|
- assert_array_equal(stop_None_transform, vect_transform)
|
||||||
|
- assert_array_equal(stop_del_transform, vect_transform)
|
||||||
|
-
|
||||||
|
-
|
||||||
|
def test_pickling_transformer():
|
||||||
|
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
|
||||||
|
orig = TfidfTransformer().fit(X)
|
||||||
|
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
|
||||||
|
index 19d5c7f..bd9da0b 100644
|
||||||
|
--- a/sklearn/feature_extraction/text.py
|
||||||
|
+++ b/sklearn/feature_extraction/text.py
|
||||||
|
@@ -795,15 +795,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
|
||||||
|
vocabulary_ : dict
|
||||||
|
A mapping of terms to feature indices.
|
||||||
|
|
||||||
|
- stop_words_ : set
|
||||||
|
- Terms that were ignored because they either:
|
||||||
|
-
|
||||||
|
- - occurred in too many documents (`max_df`)
|
||||||
|
- - occurred in too few documents (`min_df`)
|
||||||
|
- - were cut off by feature selection (`max_features`).
|
||||||
|
-
|
||||||
|
- This is only available if no vocabulary was given.
|
||||||
|
-
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
@@ -827,11 +818,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
|
||||||
|
--------
|
||||||
|
HashingVectorizer, TfidfVectorizer
|
||||||
|
|
||||||
|
- Notes
|
||||||
|
- -----
|
||||||
|
- The ``stop_words_`` attribute can get large and increase the model size
|
||||||
|
- when pickling. This attribute is provided only for introspection and can
|
||||||
|
- be safely removed using delattr or set to None before pickling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input='content', encoding='utf-8',
|
||||||
|
@@ -909,18 +895,16 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
|
||||||
|
mask = new_mask
|
||||||
|
|
||||||
|
new_indices = np.cumsum(mask) - 1 # maps old indices to new
|
||||||
|
- removed_terms = set()
|
||||||
|
for term, old_index in list(six.iteritems(vocabulary)):
|
||||||
|
if mask[old_index]:
|
||||||
|
vocabulary[term] = new_indices[old_index]
|
||||||
|
else:
|
||||||
|
del vocabulary[term]
|
||||||
|
- removed_terms.add(term)
|
||||||
|
kept_indices = np.where(mask)[0]
|
||||||
|
if len(kept_indices) == 0:
|
||||||
|
raise ValueError("After pruning, no terms remain. Try a lower"
|
||||||
|
" min_df or a higher max_df.")
|
||||||
|
- return X[:, kept_indices], removed_terms
|
||||||
|
+ return X[:, kept_indices]
|
||||||
|
|
||||||
|
def _count_vocab(self, raw_documents, fixed_vocab):
|
||||||
|
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
|
||||||
|
@@ -1046,10 +1030,10 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
|
||||||
|
if max_doc_count < min_doc_count:
|
||||||
|
raise ValueError(
|
||||||
|
"max_df corresponds to < documents than min_df")
|
||||||
|
- X, self.stop_words_ = self._limit_features(X, vocabulary,
|
||||||
|
- max_doc_count,
|
||||||
|
- min_doc_count,
|
||||||
|
- max_features)
|
||||||
|
+ X = self._limit_features(X, vocabulary,
|
||||||
|
+ max_doc_count,
|
||||||
|
+ min_doc_count,
|
||||||
|
+ max_features)
|
||||||
|
|
||||||
|
self.vocabulary_ = vocabulary
|
||||||
|
|
||||||
|
@@ -1459,15 +1443,6 @@ class TfidfVectorizer(CountVectorizer):
|
||||||
|
The inverse document frequency (IDF) vector; only defined
|
||||||
|
if ``use_idf`` is True.
|
||||||
|
|
||||||
|
- stop_words_ : set
|
||||||
|
- Terms that were ignored because they either:
|
||||||
|
-
|
||||||
|
- - occurred in too many documents (`max_df`)
|
||||||
|
- - occurred in too few documents (`min_df`)
|
||||||
|
- - were cut off by feature selection (`max_features`).
|
||||||
|
-
|
||||||
|
- This is only available if no vocabulary was given.
|
||||||
|
-
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
@@ -1491,11 +1466,6 @@ class TfidfVectorizer(CountVectorizer):
|
||||||
|
TfidfTransformer : Performs the TF-IDF transformation from a provided
|
||||||
|
matrix of counts.
|
||||||
|
|
||||||
|
- Notes
|
||||||
|
- -----
|
||||||
|
- The ``stop_words_`` attribute can get large and increase the model size
|
||||||
|
- when pickling. This attribute is provided only for introspection and can
|
||||||
|
- be safely removed using delattr or set to None before pickling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input='content', encoding='utf-8',
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -3,10 +3,11 @@
|
|||||||
Name: python-scikit-learn
|
Name: python-scikit-learn
|
||||||
Summary: A Python module for machine learning built on top of SciPy
|
Summary: A Python module for machine learning built on top of SciPy
|
||||||
Version: 0.20.4
|
Version: 0.20.4
|
||||||
Release: 4
|
Release: 5
|
||||||
License: BSD
|
License: BSD
|
||||||
URL: https://scikit-learn.org/stable/
|
URL: https://scikit-learn.org/stable/
|
||||||
Source0: https://github.com/scikit-learn/scikit-learn/archive/%{version}/scikit-learn-%{version}.tar.gz
|
Source0: https://github.com/scikit-learn/scikit-learn/archive/%{version}/scikit-learn-%{version}.tar.gz
|
||||||
|
Patch3000: backport-CVE-2024-5206.patch
|
||||||
|
|
||||||
%global _description\
|
%global _description\
|
||||||
scikit-learn is a Python module for machine learning built on top of SciPy\
|
scikit-learn is a Python module for machine learning built on top of SciPy\
|
||||||
@ -19,14 +20,14 @@ Summary: %summary
|
|||||||
%{?python_provide:%python_provide python3-scikit-learn}
|
%{?python_provide:%python_provide python3-scikit-learn}
|
||||||
%{?python_provide:%python_provide python3-sklearn}
|
%{?python_provide:%python_provide python3-sklearn}
|
||||||
|
|
||||||
BuildRequires: git python3-devel python3-numpy python3-Cython python3-pytest
|
BuildRequires: python3-devel python3-numpy python3-Cython python3-pytest
|
||||||
Requires: python3 >= 3.5 python3-numpy >= 1.11.0
|
Requires: python3 >= 3.5 python3-numpy >= 1.11.0
|
||||||
Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
|
Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
|
||||||
|
|
||||||
%description -n python3-scikit-learn %_description
|
%description -n python3-scikit-learn %_description
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%autosetup -n scikit-learn-%{version} -p1 -Sgit
|
%autosetup -n scikit-learn-%{version} -p1
|
||||||
|
|
||||||
%build
|
%build
|
||||||
%py3_build
|
%py3_build
|
||||||
@ -41,6 +42,9 @@ Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
|
|||||||
%{python3_sitearch}/scikit_learn-*.egg-info
|
%{python3_sitearch}/scikit_learn-*.egg-info
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Jun 17 2024 yaoxin <yao_xin001@hoperun.com> - 0.20.4-5
|
||||||
|
- Fix CVE-2024-5206
|
||||||
|
|
||||||
* Mon Sep 27 2021 lingsheng <lingsheng@huawei.com> - 0.20.4-4
|
* Mon Sep 27 2021 lingsheng <lingsheng@huawei.com> - 0.20.4-4
|
||||||
- Provides python-scikit-learn and python-sklearn for compatibility
|
- Provides python-scikit-learn and python-sklearn for compatibility
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user