From 6700681a5d975081ecda72940e09109afaab17ee Mon Sep 17 00:00:00 2001 From: John Taylor Date: Thu, 2 Jan 2025 21:04:06 -0500 Subject: [PATCH] get_identified_elements() will now always return pronouns --- README.md | 9 +++------ deidentification/deidentification.py | 6 +++++- deidentification/deidentification_constants.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 394bd14..8704ea1 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,6 @@ Download the required spaCy model: python -m spacy download en_core_web_trf ``` -For debugging, by setting `config.debug=True`, you will also need [VeryPrettyTable](https://github.com/smeggingsmegger/): -```bash -pip install VeryPrettyTable -``` - ## Usage ### Command Line Interface @@ -41,6 +36,8 @@ pip install VeryPrettyTable The package includes a command-line tool for quick de-identification of text files: ```bash +deidentify input_file [options] +# or: python -m deidentification.deidentify input_file [options] ``` @@ -55,7 +52,7 @@ Options: Example: ```bash # De-identify a text file and save with HTML markup -python -m deidentification.deidentify input.txt -H -o output.html -r "[REDACTED]" +deidentify input.txt -H -o output.html -r "[REDACTED]" ``` ### Python API Usage diff --git a/deidentification/deidentification.py b/deidentification/deidentification.py index d901cbe..6087225 100644 --- a/deidentification/deidentification.py +++ b/deidentification/deidentification.py @@ -69,6 +69,9 @@ def __init__(self, config: DeidentificationConfig = DeidentificationConfig()): # this combines all self.all_persons lists from multiple passes of self._find_all_persons() self.aggregate_persons: list[dict] = [] + # this combines all self.all_pronouns lists from multiple loop iterations in self.deidentify() + self.aggregate_pronouns: list[dict] = [] + self.all_pronouns: list[dict] = [] self.doc: Optional[Doc] = None self.table_class = None @@ -139,6 +142,7 @@ def deidentify(self, text: str) -> str: self.__debug_log(f"deidentify(): next iter, persons={len(self.all_persons)}") if persons_count == 0: break + self.aggregate_pronouns.extend(self.all_pronouns) self.all_pronouns = [] merged = self._merge_metadata() replaced_text = self._replace_merged(replaced_text, merged) @@ -167,7 +171,7 @@ def deidentify_with_wrapped_html(self, text: str, html_begin: str = HTML_BEGIN, return buffer.getvalue() def get_identified_elements(self) -> dict: - elements = {"message": self.replaced_text, "entities": self.aggregate_persons, "pronouns": self.all_pronouns} + elements = {"message": self.replaced_text, "entities": self.aggregate_persons, "pronouns": self.aggregate_pronouns} return elements def _find_all_persons(self) -> int: diff --git a/deidentification/deidentification_constants.py b/deidentification/deidentification_constants.py index 0ca4bb3..5099d0d 100644 --- a/deidentification/deidentification_constants.py +++ b/deidentification/deidentification_constants.py @@ -1,6 +1,6 @@ pgmName = "deidentification" pgmUrl = "https://github.com/jftuga/deidentification" -pgmVersion = "1.1.1" +pgmVersion = "1.1.2" GENDER_PRONOUNS = { "he": "HE/SHE",