From d03f526b6caefbb6e97bfcfa9b0742a2acb6b160 Mon Sep 17 00:00:00 2001 From: Saken-Tukenov Date: Wed, 10 Jan 2024 17:52:50 +0600 Subject: [PATCH] =?UTF-8?q?=D0=A2=D0=BE=D0=BB=D1=8C=D0=BA=D0=BE=20Leipzig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dataset_preparer.py | 2 +- src/multidomain_kazakh_dataset.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dataset_preparer.py b/src/dataset_preparer.py index ff29095..7941546 100644 --- a/src/dataset_preparer.py +++ b/src/dataset_preparer.py @@ -68,7 +68,7 @@ def run(self): """Run the dataset preparation process: download, merge, filter, and clean the dataset.""" logging.info("Running dataset preparation for: %s", self.dataset_name) self.download() - self.merge() + # self.merge() self.filter() self.clean() logging.info("Dataset preparation completed for: %s", self.dataset_name) diff --git a/src/multidomain_kazakh_dataset.py b/src/multidomain_kazakh_dataset.py index 57b6f52..84264f5 100644 --- a/src/multidomain_kazakh_dataset.py +++ b/src/multidomain_kazakh_dataset.py @@ -3,11 +3,11 @@ class MultidomainKazakhDataset(DatasetPreparer): def __init__(self): files = [ - 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhNews.csv', - 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhBooks.csv', + # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhNews.csv', + # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhBooks.csv', 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/leipzig.csv', - 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/oscar.csv', - 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/cc100-monolingual-crawled-data.csv' + # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/oscar.csv', + # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/cc100-monolingual-crawled-data.csv' ] super().__init__('multidomain-kazakh-dataset', files)