diff --git a/src/dataset_preparer.py b/src/dataset_preparer.py
index ff29095..7941546 100644
--- a/src/dataset_preparer.py
+++ b/src/dataset_preparer.py
@@ -68,7 +68,7 @@ def run(self):
         """Run the dataset preparation process: download, merge, filter, and clean the dataset."""
         logging.info("Running dataset preparation for: %s", self.dataset_name)
         self.download()
-        self.merge()
+        # self.merge()
         self.filter()
         self.clean()
         logging.info("Dataset preparation completed for: %s", self.dataset_name)
diff --git a/src/multidomain_kazakh_dataset.py b/src/multidomain_kazakh_dataset.py
index 57b6f52..84264f5 100644
--- a/src/multidomain_kazakh_dataset.py
+++ b/src/multidomain_kazakh_dataset.py
@@ -3,11 +3,11 @@
 class MultidomainKazakhDataset(DatasetPreparer):
 
     def __init__(self):
         files = [
-            'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhNews.csv',
-            'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhBooks.csv',
+            # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhNews.csv',
+            # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/kazakhBooks.csv',
             'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/leipzig.csv',
-            'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/oscar.csv',
-            'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/cc100-monolingual-crawled-data.csv'
+            # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/oscar.csv',
+            # 'https://huggingface.co/datasets/kz-transformers/multidomain-kazakh-dataset/resolve/main/cc100-monolingual-crawled-data.csv'
         ]
         super().__init__('multidomain-kazakh-dataset', files)