From 32aa18022a636166e1cbbea93811eb8bec622a00 Mon Sep 17 00:00:00 2001
From: Artem Ryblov
Date: Tue, 6 Feb 2024 09:37:54 +0300
Subject: [PATCH] fixing issues with explore_data.py file

---
 ml/guides/text_classification/explore_data.py | 90 +++++++++++--------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/ml/guides/text_classification/explore_data.py b/ml/guides/text_classification/explore_data.py
index cbf454f..b86977d 100644
--- a/ml/guides/text_classification/explore_data.py
+++ b/ml/guides/text_classification/explore_data.py
@@ -2,6 +2,7 @@
 
 Contains functions to help study, visualize and understand datasets.
 """
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -18,8 +19,8 @@ def get_num_classes(labels):
 
     # Arguments
         labels: list, label values.
-            There should be at lease one sample for values in the
-            range (0, num_classes -1)
+            There should be at least one sample for values in the
+            range(0, num_classes - 1)
 
     # Returns
         int, total number of classes.
@@ -31,17 +32,21 @@ def get_num_classes(labels):
     num_classes = max(labels) + 1
     missing_classes = [i for i in range(num_classes) if i not in labels]
     if len(missing_classes):
-        raise ValueError('Missing samples with label value(s) '
-                         '{missing_classes}. Please make sure you have '
-                         'at least one sample for every label value '
-                         'in the range(0, {max_class})'.format(
-                            missing_classes=missing_classes,
-                            max_class=num_classes - 1))
+        raise ValueError(
+            "Missing samples with label value(s) "
+            "{missing_classes}. Please make sure you have "
+            "at least one sample for every label value "
+            "in the range(0, {max_class})".format(
+                missing_classes=missing_classes, max_class=num_classes - 1
+            )
+        )
 
     if num_classes <= 1:
-        raise ValueError('Invalid number of labels: {num_classes}.'
-                         'Please make sure there are at least two classes '
-                         'of samples'.format(num_classes=num_classes))
+        raise ValueError(
+            "Invalid number of labels: {num_classes}."
+            "Please make sure there are at least two classes "
+            "of samples".format(num_classes=num_classes)
+        )
     return num_classes
 
 
@@ -58,36 +63,36 @@ def get_num_words_per_sample(sample_texts):
     return np.median(num_words)
 
 
-def plot_frequency_distribution_of_ngrams(sample_texts,
-                                          ngram_range=(1, 2),
-                                          num_ngrams=50):
+def plot_frequency_distribution_of_ngrams(
+    sample_texts, ngram_range=(1, 1), num_ngrams=50
+):
     """Plots the frequency distribution of n-grams.
 
     # Arguments
         samples_texts: list, sample texts.
-        ngram_range: tuple (min, mplt), The range of n-gram values to consider.
-            Min and mplt are the lower and upper bound values for the range.
+        ngram_range: tuple (min, max), The range of n-gram values to consider.
+            min and max are the lower and the upper bound values for the range.
         num_ngrams: int, number of n-grams to plot.
             Top `num_ngrams` frequent n-grams will be plotted.
     """
     # Create args required for vectorizing.
     kwargs = {
-            'ngram_range': (1, 1),
-            'dtype': 'int32',
-            'strip_accents': 'unicode',
-            'decode_error': 'replace',
-            'analyzer': 'word',  # Split text into word tokens.
+        "ngram_range": ngram_range,
+        "dtype": "int32",
+        "strip_accents": "unicode",
+        "decode_error": "replace",
+        "analyzer": "word",  # Split text into word tokens.
     }
     vectorizer = CountVectorizer(**kwargs)
 
     # This creates a vocabulary (dict, where keys are n-grams and values are
-    # idxices). This also converts every text to an array the length of
-    # vocabulary, where every element idxicates the count of the n-gram
-    # corresponding at that idxex in vocabulary.
+    # indices). This also converts every text to an array the length of
+    # vocabulary, where every element represents the count of the n-gram
+    # corresponding at that index in vocabulary.
     vectorized_texts = vectorizer.fit_transform(sample_texts)
 
     # This is the list of all n-grams in the index order from the vocabulary.
-    all_ngrams = list(vectorizer.get_feature_names())
+    all_ngrams = list(vectorizer.get_feature_names_out())
     num_ngrams = min(num_ngrams, len(all_ngrams))
     # ngrams = all_ngrams[:num_ngrams]
 
@@ -95,16 +100,25 @@ def plot_frequency_distribution_of_ngrams(sample_texts,
     all_counts = vectorized_texts.sum(axis=0).tolist()[0]
 
     # Sort n-grams and counts by frequency and get top `num_ngrams` ngrams.
-    all_counts, all_ngrams = zip(*[(c, n) for c, n in sorted(
-        zip(all_counts, all_ngrams), reverse=True)])
+    all_counts, all_ngrams = zip(
+        *[(c, n) for c, n in sorted(zip(all_counts, all_ngrams), reverse=True)]
+    )
     ngrams = list(all_ngrams)[:num_ngrams]
     counts = list(all_counts)[:num_ngrams]
 
     idx = np.arange(num_ngrams)
-    plt.bar(idx, counts, width=0.8, color='b')
-    plt.xlabel('N-grams')
-    plt.ylabel('Frequencies')
-    plt.title('Frequency distribution of n-grams')
+
+    f, ax = plt.subplots(
+        figsize=(12, 5)
+    )
+    plt.bar(idx, counts, width=0.8, color="b")
+    plt.xlabel("Top {num_ngrams} N-grams".format(num_ngrams=num_ngrams))
+    plt.ylabel("Frequencies")
+    plt.title(
+        "Frequency distribution of n-grams with range={ngram_range}".format(
+            ngram_range=ngram_range
+        )
+    )
     plt.xticks(idx, ngrams, rotation=45)
     plt.show()
 
@@ -116,9 +130,9 @@ def plot_sample_length_distribution(sample_texts):
         samples_texts: list, sample texts.
     """
     plt.hist([len(s) for s in sample_texts], 50)
-    plt.xlabel('Length of a sample')
-    plt.ylabel('Number of samples')
-    plt.title('Sample length distribution')
+    plt.xlabel("Length of a sample")
+    plt.ylabel("Number of samples")
+    plt.title("Sample length distribution")
     plt.show()
 
 
@@ -134,9 +148,9 @@ def plot_class_distribution(labels):
     count_map = Counter(labels)
     counts = [count_map[i] for i in range(num_classes)]
     idx = np.arange(num_classes)
-    plt.bar(idx, counts, width=0.8, color='b')
-    plt.xlabel('Class')
-    plt.ylabel('Number of samples')
-    plt.title('Class distribution')
+    plt.bar(idx, counts, width=0.8, color="b")
+    plt.xlabel("Class")
+    plt.ylabel("Number of samples")
+    plt.title("Class distribution")
     plt.xticks(idx, idx)
     plt.show()
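
For reference, a short usage sketch of the patched module follows. It is not part of the commit itself: it assumes the patched explore_data.py is importable, and that numpy, matplotlib and scikit-learn >= 1.0 (the release that provides CountVectorizer.get_feature_names_out) are installed. The toy corpus and labels are invented purely for illustration.

# Usage sketch (not part of the patch); toy data is made up for illustration.
import explore_data

sample_texts = [
    "the movie was great",
    "the movie was terrible",
    "a great film with a terrible ending",
    "an average film",
]
labels = [0, 1, 0, 1]

# Basic dataset statistics from the patched helpers.
print(explore_data.get_num_classes(labels))                 # 2
print(explore_data.get_num_words_per_sample(sample_texts))  # median word count

# ngram_range is now forwarded to CountVectorizer instead of being hard-coded,
# so unigrams and bigrams can be plotted together.
explore_data.plot_frequency_distribution_of_ngrams(
    sample_texts, ngram_range=(1, 2), num_ngrams=10
)
explore_data.plot_sample_length_distribution(sample_texts)
explore_data.plot_class_distribution(labels)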