CambioML · CambioML · Oct 22, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
@@ -13,6 +13,7 @@
     ModelType,
     check_file_type_and_path,
     check_model,
+    check_resume_extract_type,
     upload_file_to_presigned_url,
 )
 
@@ -42,6 +43,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
         """
         self._sync_extract_url = f"{base_url}/extract"
         self._sync_json_url = f"{base_url}/json/extract"
+        self._sync_resume_url = f"{base_url}/resume/extract"
         self._sync_refined_url = f"{base_url}/refined_parse"
         self._async_upload_url = f"{base_url}/async/upload"
         self._async_fetch_url = f"{base_url}/async/fetch"
@@ -187,6 +189,74 @@ def extract_key_value(
         else:
             return f"Error: {response.status_code} {response.text}", None
 
+    def extract_resume_key_value(
+        self,
+        file_path: str,
+        extract_type: str,
+    ) -> Tuple[str, str]:
+        """Extract one section of a resume in real-time.
+
+        The file is validated, base64-encoded and POSTed to
+        ``self._sync_resume_url`` together with the requested extract type.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+            extract_type (str): The type of extraction to be performed. It can be one of the following:
+                - "education": Education
+                - "work_experience": Work Experience
+                - "personal_info": Personal Information
+                - "skills": Skills
+                - "certifications": Certifications
+                - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
+        Returns:
+            tuple: On success, the extracted data and a "Time Elapsed: ..." string.
+                On any failure, an "Error: ..." message and None.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+        if error:
+            return error, None
+
+        # Reject unsupported extract types before reading or sending the file
+        error = check_resume_extract_type(extract_type)
+        if error:
+            return error, None
+
+        # Encode the file content in base64
+        with open(file_path, "rb") as file:
+            encoded_file = base64.b64encode(file.read()).decode("utf-8")
+
+        # Create the JSON payload
+        payload = {
+            "file_content": encoded_file,
+            "file_type": file_extension,
+            "extract_type": extract_type,
+        }
+
+        # Send the POST request, timing only the round trip
+        start_time = time.time()
+        response = requests.post(
+            self._sync_resume_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+        end_time = time.time()
+
+        # Check if the request was successful
+        if response.status_code != 200:
+            return f"Error: {response.status_code} {response.text}", None
+
+        try:
+            result = response.json()["extraction_result"]
+        except (json.JSONDecodeError, KeyError, TypeError):
+            # The body was not JSON, or it did not carry an
+            # "extraction_result" field; report it instead of raising.
+            return f"Error: Invalid JSON response: {response.text}", None
+
+        # Unwrap the requested section when the result is keyed by extract
+        # type; any other result shape is returned unchanged.
+        if isinstance(result, dict) and extract_type in result:
+            result = result[extract_type]
+        return (
+            result,
+            f"Time Elapsed: {end_time - start_time:.2f} seconds",
+        )
+
+
     def async_extract(
         self,
         file_path: str,
@@ -286,6 +356,59 @@ def async_extract_key_value(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_extract_resume_key_value(
+        self,
+        file_path: str,
+        extract_type: str,
+    ) -> str:
+        """Extract key-value pairs from a resume asynchronously.
+
+        Registers a "resume_extract" job at ``self._async_upload_url`` and
+        hands the response to ``upload_file_to_presigned_url`` to upload the file.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+            extract_type (str): The type of extraction to be performed. It can be one of the following:
+                - "education": Education
+                - "work_experience": Work Experience
+                - "personal_info": Personal Information
+                - "skills": Skills
+                - "certifications": Certifications
+                - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
+        Returns:
+            str: The file id of the uploaded file (as returned by
+                ``upload_file_to_presigned_url``), or an "Error: ..." message
+                when the file or the extract type fails validation.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type.
+        # Errors are returned as a plain string so that every return path
+        # matches the declared ``str`` return type.
+        error = check_file_type_and_path(file_path, file_extension)
+        if error:
+            return error
+
+        error = check_resume_extract_type(extract_type)
+        if error:
+            return error
+
+        file_name = Path(file_path).name
+
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "resume_extract",
+            "extract_args": {
+                "extract_type": extract_type,
+            },
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
+
     def async_fetch(
         self,
         file_id: str,
@@ -337,6 +460,8 @@ def async_fetch(
             result = response.json()
             if "json" in result:
                 return result["json"]
+            elif "resume_extraction" in result:
+                return result["resume_extraction"]
             elif "markdown" in result:
                 markdown_list = result["markdown"]
                 return "\n".join(markdown_list)

@@ -22,6 +22,16 @@ class ModelType(Enum):
     "gif",
 ]
 
+# Extract types accepted by check_resume_extract_type; any other value makes
+# that check return an error message listing these names in this order.
+SUPPORTED_RESUME_EXTRACT_TYPES = [
+    "pii",
+    "education",
+    "work_experience",
+    "personal_info",
+    "skills",
+    "certifications",
+    "projects",
+]
+
 
 def upload_file_to_presigned_url(
     file_path: str, response: requests.Response, timeout: int = 10
@@ -61,3 +71,10 @@ def check_file_type_and_path(file_path, file_extension):
     if file_extension not in SUPPORTED_FILE_EXTENSIONS:
         supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
         return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
+
+
+def check_resume_extract_type(extract_type):
+    """Validate an extract type for resume extraction.
+
+    Returns None when extract_type is listed in SUPPORTED_RESUME_EXTRACT_TYPES,
+    otherwise an "Error: ..." message naming the supported types.
+    """
+    if extract_type in SUPPORTED_RESUME_EXTRACT_TYPES:
+        return None
+
+    allowed = ", ".join(SUPPORTED_RESUME_EXTRACT_TYPES)
+    return (
+        f"Error: Unsupported resume extract type: {extract_type}. "
+        f"Supported extract types include {allowed}."
+    )
@@ -8,7 +8,7 @@
 import Levenshtein
 from dotenv import load_dotenv
 
-from tests.test_data import EXTRACT_JSON_TEST_DATA
+from tests.test_data import EXTRACT_JSON_TEST_DATA, EXTRACT_RESUME_TEST_DATA
 
 sys.path.append(".")
 load_dotenv(override=True)
@@ -206,6 +206,67 @@ def test_async_extract_key_value_and_fetch(self):
                 # wait 1 s between requests
                 time.sleep(1)
 
+    def test_sync_extract_resume_key_value(self):
+        """Run synchronous resume extraction once per fixture file and extract type.
+
+        Each combination is a subtest: the stringified result must score at
+        least 80 in compare_markdown against the stringified expected output,
+        and the second return value must contain "Time Elapsed".
+        """
+        for case in EXTRACT_RESUME_TEST_DATA:
+            resume_path = case["working_file"]
+            for extract_type, expected in case["correct_output"].items():
+                with self.subTest(working_file=resume_path, extract_type=extract_type):
+                    # extract
+                    extracted, timing = self.ap.extract_resume_key_value(
+                        resume_path, extract_type=extract_type
+                    )
+                    print("\n\n Key Value Result: ")
+                    print(extracted)
+                    print("\n\n Correct Output: ")
+                    print(expected)
+
+                    # TODO: update with proper value checking
+                    # compare the string forms of the result and the expected output
+                    similarity = compare_markdown(str(extracted), str(expected))
+
+                    self.assertGreaterEqual(
+                        similarity,
+                        80,
+                        f"Output similarity too low: {similarity:.2f}%",
+                    )
+
+                    self.assertIn("Time Elapsed", timing)
+                    # wait 1 s between requests
+                    time.sleep(1)
+
+
+    def test_async_extract_resume_key_value_and_fetch(self):
+        """Submit an asynchronous resume extraction, fetch it, and compare.
+
+        Each fixture file and extract type is a subtest: the returned file id
+        must not be an "Error:" string, and the stringified fetched result must
+        score at least 80 in compare_markdown against the expected output.
+        """
+        for case in EXTRACT_RESUME_TEST_DATA:
+            resume_path = case["working_file"]
+            for extract_type, expected in case["correct_output"].items():
+                with self.subTest(working_file=resume_path, extract_type=extract_type):
+                    # extract
+                    file_id = self.ap.async_extract_resume_key_value(
+                        resume_path, extract_type=extract_type
+                    )
+                    self.assertFalse(file_id.startswith("Error:"), file_id)
+                    # fetch
+                    fetched = self.ap.async_fetch(file_id=file_id)
+                    # TODO: update with proper value checking
+                    # compare the string forms of the result and the expected output
+                    similarity = compare_markdown(str(fetched), str(expected))
+
+                    self.assertGreaterEqual(
+                        similarity,
+                        80,
+                        f"Output similarity too low: {similarity:.2f}%",
+                    )
+                    # wait 1 s between requests
+                    time.sleep(1)
+
+
 
 # Run the test suite with verbose output when this file is executed directly.
 if __name__ == "__main__":
     unittest.main(verbosity=2)
@@ -76,3 +76,134 @@
         ],
     },
 ]
+
+# Fixtures for the resume-extraction tests. Each entry pairs a sample file path
+# ("working_file") with "correct_output": a dict keyed by extract type
+# (pii, personal_info, education, work_experience, skills, certifications,
+# projects) whose value is the expected extraction for that type.
+EXTRACT_RESUME_TEST_DATA = [
+    {
+        "working_file": "./examples/sample_data/test_resume.pdf",
+        "correct_output": {
+            "pii": {
+                "full_name": "John Doe",
+                "email": "johndoe@example.com",
+                "phone": "(123) 456-7890",
+            },
+            "personal_info": {
+                "name": "John Doe",
+                "phone_number": "+1-123-456-7890",
+                "address": "123 Main St, Anytown, USA",
+                "email_address": "johndoe@example.com",
+                "linkedin_url": "linkedin.com/in/johndoe",
+                "github_url": "github.com/johndoe",
+                "summary": "Experienced software developer with a passion for creating innovative solutions and a strong focus on full-stack development. Skilled in a variety of programming languages and frameworks, with a proven track record of delivering high-quality software in fast-paced environments.",
+            },
+            "education": [
+                {
+                    "organization": "University of Anytown",
+                    "degree": "Bachelor of Science",
+                    "major": "Computer Science",
+                    "start_date": "2013-08-01",
+                    "end_date": "2017-05-01",
+                    "courses": [
+                        "Data Structures",
+                        "Algorithms",
+                        "Web Development",
+                        "Cloud Computing",
+                        "Databases",
+                    ],
+                    "achievements": ["GPA: 3.8/4.0"],
+                }
+            ],
+            "work_experience": [
+                {
+                    "job_title": "Senior Software Engineer",
+                    "company_name": "Tech Solutions Corp",
+                    "location": "Anytown, USA",
+                    "start_date": "2022-01-01",
+                    "end_date": "Present",
+                    "job_type": None,
+                    "summary": "Led a team of developers to design and implement scalable microservices architecture.",
+                    "bullet_points": [
+                        "Led a team of 5 developers to design and implement a scalable microservices architecture using Node.js and AWS Lambda.",
+                        "Improved application performance by 30% through code optimization and database tuning.",
+                        "Developed and deployed CI/CD pipelines using GitHub Actions and AWS CodePipeline.",
+                        "Collaborated with product managers to define technical requirements and timelines for new features.",
+                    ],
+                },
+                {
+                    "job_title": "Full-Stack Developer",
+                    "company_name": "Innovative Web Agency",
+                    "location": "Somecity, USA",
+                    "start_date": "2019-08-01",
+                    "end_date": "2021-12-31",
+                    "job_type": None,
+                    "summary": "Built responsive web applications for a variety of clients.",
+                    "bullet_points": [
+                        "Built responsive web applications using React, TypeScript, and Tailwind CSS for a variety of clients.",
+                        "Integrated third-party APIs (Stripe, Twilio) to enhance application functionality.",
+                        "Maintained and updated legacy systems written in PHP and MySQL.",
+                        "Spearheaded the migration of on-premise infrastructure to AWS, reducing hosting costs by 40%.",
+                    ],
+                },
+            ],
+            "skills": {
+                "Programming Languages": [
+                    "JavaScript",
+                    "Typescript",
+                    "Python",
+                    "Java",
+                    "SQL",
+                ],
+                "Tools": [
+                    "Git",
+                    "Github",
+                    "Gitlab",
+                    "Jira",
+                    "Jenkins",
+                ],
+                "Other": [
+                    "AWS",
+                    "Docker",
+                    "Kubernetes",
+                    "Terraform",
+                ],
+            },
+            "certifications": [
+                {
+                    "name": "AWS Certified Solutions Architect – Associate",
+                    "description": "Certification demonstrating knowledge of AWS solutions architecture.",
+                },
+                {
+                    "name": "Certified Kubernetes Administrator (CKA)",
+                    "description": "Certification for Kubernetes administration skills.",
+                },
+                {
+                    "name": "Google Cloud Professional DevOps Engineer",
+                    "description": "Certification validating proficiency in Google Cloud DevOps practices.",
+                },
+            ],
+            "projects": [
+                {
+                    "organization": None,
+                    "project_name": "Project Management App",
+                    "location": None,
+                    "start_date": "2019-08-01",
+                    "end_date": "2021-12-01",
+                    "descriptions": [
+                        "A React-based project management tool that allows teams to track tasks, set deadlines, and collaborate in real-time.",
+                        "Integrated with Firebase for authentication and Firestore for real-time data sync.",
+                    ],
+                },
+                {
+                    "organization": None,
+                    "project_name": "E-Commerce Platform",
+                    "location": None,
+                    "start_date": "2019-08-01",
+                    # NOTE(review): this end_date is earlier than the start_date
+                    # above - confirm against the sample resume / expected API output.
+                    "end_date": "2018-12-01",
+                    "descriptions": [
+                        "Built a fully functional e-commerce platform using Node.js, Express, and MongoDB.",
+                        "Includes user authentication, product search and filtering, and an admin dashboard for order management.",
+                    ],
+                },
+            ],
+        },
+    },
+]