#348 merge existing and new annotation fields

GateNLP · May 26, 2023 · 101acd2
1 parent 416083c
commit 101acd2
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 26 deletions.
diff --git a/backend/models.py b/backend/models.py
@@ -969,28 +969,31 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
         # Create dictionary for document
         doc_dict = None
         if json_format == "raw" or json_format == "csv":
-            doc_dict = self.data
+            doc_dict = dict(self.data)
         elif json_format == "gate":
             # GATE json format are expected to have an existing "features" field
-            features_dict = self.data["features"] if "features" in self.data and isinstance(self.data["features"], dict) else {}
+            features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {}
 
             # Add any non-compliant top-level fields into the "features" field instead
-            ignore_keys = {"text", "features", self.project.document_id_field}
+            ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field}
             features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys})
 
             doc_dict = {
                 "text": self.data["text"],
                 "features": features_dict,
-                "offset_type": "p",
+                "offset_type": self.data["offset_type"] if "offset_type" in self.data else "p",  # Use original offset type
                 "name": get_value_from_key_path(self.data, self.project.document_id_field)
             }
 
 
+
+
         # Insert annotation sets into the doc dict
         annotations = self.annotations.filter(status=Annotation.COMPLETED)
         if json_format == "csv":
+            # Gets pre-existing annotations
+            annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {}
             # Format annotations for CSV export
-            annotation_sets = {}
             for annotation in annotations:
                 a_data = annotation.data
                 annotation_dict = {}
@@ -1010,8 +1013,9 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
             doc_dict["annotations"] = annotation_sets
 
         else:
+            # Gets pre-existing annotations
+            annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {}
             # Format for JSON in line with GATE formatting
-            annotation_sets = {}
             for annotation in annotations:
                 a_data = annotation.data
                 anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"
@@ -1024,14 +1028,13 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                             "end": 0,
                             "id": 0,
                             "duration_seconds": annotation.time_to_complete,
-                            "features": {
-                                "label": a_data
-                            }
+                            "features": a_data
                         }
                     ],
                     "next_annid": 1,
                 }
                 annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set
+
             doc_dict["annotation_sets"] = annotation_sets
 
         return doc_dict

diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py
@@ -1115,8 +1115,52 @@ def setUp(self):
                         "gate_format_feature1": "Gate feature test value",
                         "gate_format_feature2": "Gate feature test value",
                         "gate_format_feature3": "Gate feature test value",
+                    },
+                    "offset_type": "x",
+                    "annotations": {
+                        "existing_annotator1": {
+                            "sentiment": "positive"
+                        },
+                        f"2": {
+                            "sentiment": "positive"
+                        }
+
+                    },
+                    "annotation_sets": {
+                        "existing_annotator1": {
+                            "name": "existing_annotator1",
+                            "annotations": [
+                                {
+                                    "type": "Document",
+                                    "start": 0,
+                                    "end": 10,
+                                    "id": 0,
+                                    "features": {
+                                        "sentiment": "positive"
+                                    }
+                                }
+                            ],
+                            "next_annid": 1
+                        },
+                        f"{settings.ANONYMIZATION_PREFIX}2": {
+                            "name": f"{settings.ANONYMIZATION_PREFIX}1",
+                            "annotations": [
+                                {
+                                    "type": "Document",
+                                    "start": 0,
+                                    "end": 10,
+                                    "id": 0,
+                                    "features": {
+                                        "sentiment": "positive"
+                                    }
+                                }
+                            ],
+                            "next_annid": 1
+                        }
+
                     }
 
+
                 }
             )
 
@@ -1161,11 +1205,14 @@ def test_export_raw(self):
             self.assertTrue("feature2" in doc_dict)
             self.assertTrue("feature3" in doc_dict)
             self.assertTrue("features" in doc_dict)
+            self.assertTrue("offset_type" in doc_dict)
+            self.assertTrue("annotations" in doc_dict)
             doc_features = doc_dict["features"]
             self.assertTrue("gate_format_feature1" in doc_features)
             self.assertTrue("gate_format_feature2" in doc_features)
             self.assertTrue("gate_format_feature3" in doc_features)
 
+
             self.check_raw_gate_annotation_formatting(doc_dict)
 
     def test_export_gate(self):
@@ -1178,37 +1225,56 @@ def test_export_gate(self):
 
             self.assertTrue("text" in doc_dict)
             self.assertTrue("features" in doc_dict)
+            self.assertFalse("annotations" in doc_dict)
+            self.assertEqual(doc_dict["offset_type"], "x")
             doc_features = doc_dict["features"]
             self.assertTrue("id" in doc_features)
             self.assertTrue("feature1" in doc_features)
             self.assertTrue("feature2" in doc_features)
             self.assertTrue("feature3" in doc_features)
+            self.assertTrue("annotations" in doc_features)
             self.assertFalse("features" in doc_features, "Double nesting of features field")
+            self.assertFalse("offset_type" in doc_features, "Double nesting of offset_type field")
             self.assertTrue("gate_format_feature1" in doc_features)
             self.assertTrue("gate_format_feature2" in doc_features)
             self.assertTrue("gate_format_feature3" in doc_features)
 
             self.check_raw_gate_annotation_formatting(doc_dict)
 
+    def test_export_gate_with_no_offset_type(self):
+
+        for document in self.project.documents.all():
+            document.data.pop("offset_type")
+
+            doc_dict = document.get_doc_annotation_dict("gate")
+            self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p")
+
+
     def check_raw_gate_annotation_formatting(self, doc_dict):
         self.assertTrue("annotation_sets" in doc_dict)
-        self.assertTrue(len(doc_dict["annotation_sets"]) == 3)
+        self.assertEqual(len(doc_dict["annotation_sets"]), 4)
 
         # Test annotation formatting
         for aset_key, aset_data in doc_dict["annotation_sets"].items():
-            self.assertTrue("name" in aset_data)
-            self.assertTrue("annotations" in aset_data)
-            self.assertEqual(len(aset_data["annotations"]), 1)
-            anno_dict = aset_data["annotations"][0]
-            self.assertTrue("type" in anno_dict)
-            self.assertTrue("start" in anno_dict)
-            self.assertTrue("end" in anno_dict)
-            self.assertTrue("id" in anno_dict)
-            self.assertTrue("features" in anno_dict)
-            self.assertTrue("label" in anno_dict["features"])
-            label_dict = anno_dict["features"]["label"]
-            self.assertTrue("text1" in label_dict)
-            self.assertTrue("checkbox1" in label_dict)
+            if aset_key != "existing_annotator1":
+                self.assertTrue("name" in aset_data)
+                self.assertTrue("annotations" in aset_data)
+                self.assertEqual(len(aset_data["annotations"]), 1)
+                anno_dict = aset_data["annotations"][0]
+                self.assertTrue("type" in anno_dict)
+                self.assertTrue("start" in anno_dict)
+                self.assertTrue("end" in anno_dict)
+                self.assertTrue("id" in anno_dict)
+                self.assertTrue("features" in anno_dict)
+                features_dict = anno_dict["features"]
+                self.assertTrue("text1" in features_dict)
+                self.assertTrue("checkbox1" in features_dict)
+            else:
+                # Check that existing annotation from document upload is carried over
+                self.assertEqual(aset_data["annotations"][0]["features"]["sentiment"], "positive")
+
+
+
 
     def test_export_csv(self):
 
@@ -1222,11 +1288,14 @@ def test_export_csv(self):
             self.assertTrue("feature2" in doc_dict)
             self.assertTrue("feature3" in doc_dict)
             self.assertTrue("annotations" in doc_dict)
-            self.assertTrue(len(doc_dict["annotations"]) == 3)
+            self.assertEqual(len(doc_dict["annotations"]), 4)
             anno_set_dict = doc_dict["annotations"]
             for set_key in anno_set_dict:
-                self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
-                self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
+                if set_key != "existing_annotator1":
+                    self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
+                    self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
+                else:
+                    self.assertEqual(anno_set_dict[set_key]["sentiment"], "positive")
 
     def test_export_raw_anonymized(self):