From a9630472be39ae5c5567b109363e82d634d76a0d Mon Sep 17 00:00:00 2001 From: maximzubkov Date: Sat, 29 May 2021 14:42:53 +0300 Subject: [PATCH 1/3] Fix astminer to code2seq --- code2seq/preprocessing/astminer_to_code2seq.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code2seq/preprocessing/astminer_to_code2seq.py b/code2seq/preprocessing/astminer_to_code2seq.py index 790440e..77d669f 100644 --- a/code2seq/preprocessing/astminer_to_code2seq.py +++ b/code2seq/preprocessing/astminer_to_code2seq.py @@ -10,7 +10,10 @@ def _get_id2value_from_csv(path_: str) -> Dict[str, str]: - return dict(numpy.genfromtxt(path_, delimiter=",", dtype=(str, str))[1:]) + with open(path_, "r") as f: + lines = f.read().strip().split("\n")[1:] + lines = [line.split(",", maxsplit=1) for line in lines] + return {k: v for k, v in lines} def preprocess_csv(data_folder: str, dataset_name: str, holdout_name: str, is_shuffled: bool): From 1c79985757f218c3da36cf9fbf52e07a989d3d92 Mon Sep 17 00:00:00 2001 From: maximzubkov Date: Sat, 29 May 2021 14:43:54 +0300 Subject: [PATCH 2/3] Fix path to astminer script --- scripts/download_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/download_data.sh b/scripts/download_data.sh index 699a435..560886d 100755 --- a/scripts/download_data.sh +++ b/scripts/download_data.sh @@ -9,7 +9,7 @@ LOAD_SPLITTED=false DATA_DIR=./data POJ_DOWNLOAD_SCRIPT=./scripts/download_poj.sh CODEFORCES_DOWNLOAD_SCRIPT=./scripts/download_codeforces.sh -ASTMINER_PATH=../astminer/build/shadow/lib-0.*.jar +ASTMINER_PATH=../astminer/build/shadow/astminer.jar SPLIT_SCRIPT=./scripts/split_dataset.sh function is_int(){ From caf3fc3abcb761a0b42cef6d37ab79013c15f933 Mon Sep 17 00:00:00 2001 From: maximzubkov Date: Sat, 29 May 2021 14:50:13 +0300 Subject: [PATCH 3/3] Fix mypy issues --- code2seq/preprocessing/astminer_to_code2seq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code2seq/preprocessing/astminer_to_code2seq.py b/code2seq/preprocessing/astminer_to_code2seq.py index 77d669f..4d478b1 100644 --- a/code2seq/preprocessing/astminer_to_code2seq.py +++ b/code2seq/preprocessing/astminer_to_code2seq.py @@ -12,8 +12,8 @@ def _get_id2value_from_csv(path_: str) -> Dict[str, str]: with open(path_, "r") as f: lines = f.read().strip().split("\n")[1:] - lines = [line.split(",", maxsplit=1) for line in lines] - return {k: v for k, v in lines} + parsed_lines = [line.split(",", maxsplit=1) for line in lines] + return {k: v for k, v in parsed_lines} def preprocess_csv(data_folder: str, dataset_name: str, holdout_name: str, is_shuffled: bool):