From 9901253f465f5db4a7d75e645e493f92c1a66a7c Mon Sep 17 00:00:00 2001
From: UrbanGISer <piantu@hotmail.com>
Date: Tue, 14 Nov 2023 17:30:41 -0500
Subject: [PATCH 01/12] Add Socrata Data Nodes

---
 knime_extension/geospatial_env.yml    |   1 +
 knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml
index 46a6a72d..fb95bb2d 100644
--- a/knime_extension/geospatial_env.yml
+++ b/knime_extension/geospatial_env.yml
@@ -35,3 +35,4 @@ dependencies:
   - pointpats=2.3.0
   - pip:
     - ipinfo==4.4.3
+    - sodapy==2.2.0
diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index b9a47de3..9a370f48 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -686,3 +686,160 @@ def execute(self, exec_context: knext.ExecutionContext):
         gdf = get_osmnx().geocoder.geocode_to_gdf(self.placename)
         gdf = gdf.reset_index(drop=True)
         return knext.Table.from_pandas(gdf)
+
+
+############################################
+# Socrata Search
+############################################
+@knext.node(
+    name="Socrata Search",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Search.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset list",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataSearchNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
+    """
+
+    queryitem = knext.StringParameter(
+        label="Input searching item",
+        description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. 
+                   This search is not case-sensitive and can include multiple words separated by spaces. """,
+        default_value="Massachusetts",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        from urllib.request import Request, urlopen
+        import pandas as pd
+        import json
+        from pandas import json_normalize
+
+        query_item = self.queryitem
+        request = Request(
+            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+        )
+
+        response = urlopen(request)
+        response_body = response.read()
+
+        # Load the JSON response into a Python dictionary
+        data = json.loads(response_body)
+
+        # Extract the "results" key, which contains the dataset information
+        dataset_info = data["results"]
+
+        # Create a DataFrame from the dataset information, and flatten the nested dictionaries
+        df = json_normalize(dataset_info)
+        df = df.drop(
+            columns=["classification.domain_tags", "classification.domain_metadata"]
+        )
+
+        # Find List
+        list_columns = [
+            col for col in df.columns if any(isinstance(item, list) for item in df[col])
+        ]
+
+        # Drop error list column
+        for col in list_columns:
+            try:
+                df[col] = df[col].apply(
+                    lambda x: ", ".join(x) if isinstance(x, list) else x
+                )
+            except Exception as e:
+                df.drop(columns=[col], inplace=True)
+
+        # Drop columns that cannot be saved in KNIME
+        drop_columns = []
+        for col in df.columns:
+            try:
+                # Attempt to convert the column to a KNIME-compatible data type
+                knime_table = knext.Table.from_pandas(df[[col]])
+            except Exception as e:
+                # If an exception is raised, add the column to the list of columns to drop
+                drop_columns.append(col)
+
+        # Drop the columns that cannot be saved in KNIME
+        df.drop(columns=drop_columns, inplace=True)
+        df.replace("?", pd.NA, inplace=True)
+        df.replace("", pd.NA, inplace=True)
+        df.dropna(axis=1, how="all", inplace=True)
+        df = df.reset_index(drop=True)
+        return knext.Table.from_pandas(df)
+
+
+############################################
+# Socrata Data Query
+############################################
+@knext.node(
+    name="Socrata Data Query",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Data Query.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataDataNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
+
+    For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9"  and a metadata domain of "opendata.utah.gov".
+    They can be found in the link under API, "https://opendata.utah.gov/resource/q22t-rbk9.json". Both items are used for data retrieval.
+    """
+
+    metadata_domain = knext.StringParameter(
+        label="Metadata domain",
+        description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    resource_id = knext.StringParameter(
+        label="Resource ID",
+        description="""The value in the column resource.id of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        import pandas as pd
+        import json
+        import pandas as pd
+        from sodapy import Socrata
+
+        # Unauthenticated client only works with public data sets. Note 'None'
+        # in place of application token, and no username or password:
+        client = Socrata(self.metadata_domain, None)
+
+        # Example authenticated client (needed for non-public datasets):
+        # client = Socrata(data.cdc.gov,
+        #                  MyAppToken,
+        #                  username="user@example.com",
+        #                  password="AFakePassword")
+
+        # First 2000 results, returned as JSON from API / converted to Python list of
+        # dictionaries by sodapy.
+        results = client.get(self.resource_id, limit=100000)
+
+        # Convert to pandas DataFrame
+        results_df = pd.DataFrame.from_records(results)
+
+        return knext.Table.from_pandas(results_df)

From 29de8d32f1a58da8073d6556d0fa0a5df7a3bd32 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Tue, 5 Mar 2024 08:05:02 -0500
Subject: [PATCH 02/12] revise socrata

---
 knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index 9a370f48..dd32fa1f 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -700,11 +700,26 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset based on search keywords",
+    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
 )
 class SocrataSearchNode:
-    """Retrive the open data category via Socrata API.
-
+    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
+    
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
+       
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -725,10 +740,12 @@ def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
         from pandas import json_normalize
+        from urllib.parse import quote
 
         query_item = self.queryitem
+        encoded_query_item = quote(query_item)
         request = Request(
-            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+            f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
         response = urlopen(request)
@@ -742,9 +759,11 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
-        df = df.drop(
-            columns=["classification.domain_tags", "classification.domain_metadata"]
-        )
+        # Check if columns exist before dropping them
+        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
+        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+        df = df.drop(columns=columns_to_drop)
+
 
         # Find List
         list_columns = [
@@ -794,8 +813,23 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset based on search keywords",
 )
 class SocrataDataNode:
-    """Retrive the open data category via Socrata API.
-
+    """Retrieve the open data category via Socrata API.
+
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
+  
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -828,7 +862,15 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-
+        limit = 100000 
+        offset = 0
+        all_results = []
+        while True:
+            results = client.get(self.resource_id, limit=limit, offset=offset)
+            if not results:
+                break
+            all_results.extend(results)
+            offset += limit
         # Example authenticated client (needed for non-public datasets):
         # client = Socrata(data.cdc.gov,
         #                  MyAppToken,
@@ -837,9 +879,7 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # First 2000 results, returned as JSON from API / converted to Python list of
         # dictionaries by sodapy.
-        results = client.get(self.resource_id, limit=100000)
-
         # Convert to pandas DataFrame
-        results_df = pd.DataFrame.from_records(results)
+        results_df = pd.DataFrame.from_records(all_results)
 
         return knext.Table.from_pandas(results_df)

From 6e512200dff86fb954ea55a8212741081464138b Mon Sep 17 00:00:00 2001
From: UrbanGISer <piantu@hotmail.com>
Date: Tue, 14 Nov 2023 17:30:41 -0500
Subject: [PATCH 03/12] Add Socrata Data Nodes

---
 knime_extension/geospatial_env.yml    |   1 +
 knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml
index a549339f..844dada7 100644
--- a/knime_extension/geospatial_env.yml
+++ b/knime_extension/geospatial_env.yml
@@ -36,3 +36,4 @@ dependencies:
   - pip:
     - ipinfo==4.4.3
     - pulp==2.7.0
+    - sodapy==2.2.0
diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index d7032034..e09b7dc9 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -878,3 +878,160 @@ def execute(self, exec_context: knext.ExecutionContext):
             crs="EPSG:4326",
         )
         return knext.Table.from_pandas(gdf)
+
+
+############################################
+# Socrata Search
+############################################
+@knext.node(
+    name="Socrata Search",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Search.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset list",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataSearchNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
+    """
+
+    queryitem = knext.StringParameter(
+        label="Input searching item",
+        description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. 
+                   This search is not case-sensitive and can include multiple words separated by spaces. """,
+        default_value="Massachusetts",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        from urllib.request import Request, urlopen
+        import pandas as pd
+        import json
+        from pandas import json_normalize
+
+        query_item = self.queryitem
+        request = Request(
+            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+        )
+
+        response = urlopen(request)
+        response_body = response.read()
+
+        # Load the JSON response into a Python dictionary
+        data = json.loads(response_body)
+
+        # Extract the "results" key, which contains the dataset information
+        dataset_info = data["results"]
+
+        # Create a DataFrame from the dataset information, and flatten the nested dictionaries
+        df = json_normalize(dataset_info)
+        df = df.drop(
+            columns=["classification.domain_tags", "classification.domain_metadata"]
+        )
+
+        # Find List
+        list_columns = [
+            col for col in df.columns if any(isinstance(item, list) for item in df[col])
+        ]
+
+        # Drop error list column
+        for col in list_columns:
+            try:
+                df[col] = df[col].apply(
+                    lambda x: ", ".join(x) if isinstance(x, list) else x
+                )
+            except Exception as e:
+                df.drop(columns=[col], inplace=True)
+
+        # Drop columns that cannot be saved in KNIME
+        drop_columns = []
+        for col in df.columns:
+            try:
+                # Attempt to convert the column to a KNIME-compatible data type
+                knime_table = knext.Table.from_pandas(df[[col]])
+            except Exception as e:
+                # If an exception is raised, add the column to the list of columns to drop
+                drop_columns.append(col)
+
+        # Drop the columns that cannot be saved in KNIME
+        df.drop(columns=drop_columns, inplace=True)
+        df.replace("?", pd.NA, inplace=True)
+        df.replace("", pd.NA, inplace=True)
+        df.dropna(axis=1, how="all", inplace=True)
+        df = df.reset_index(drop=True)
+        return knext.Table.from_pandas(df)
+
+
+############################################
+# Socrata Data Query
+############################################
+@knext.node(
+    name="Socrata Data Query",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Data Query.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataDataNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
+
+    For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9"  and a metadata domain of "opendata.utah.gov".
+    They can be found in the link under API, "https://opendata.utah.gov/resource/q22t-rbk9.json". Both items are used for data retrieval.
+    """
+
+    metadata_domain = knext.StringParameter(
+        label="Metadata domain",
+        description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    resource_id = knext.StringParameter(
+        label="Resource ID",
+        description="""The value in the column resource.id of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        import pandas as pd
+        import json
+        import pandas as pd
+        from sodapy import Socrata
+
+        # Unauthenticated client only works with public data sets. Note 'None'
+        # in place of application token, and no username or password:
+        client = Socrata(self.metadata_domain, None)
+
+        # Example authenticated client (needed for non-public datasets):
+        # client = Socrata(data.cdc.gov,
+        #                  MyAppToken,
+        #                  username="user@example.com",
+        #                  password="AFakePassword")
+
+        # First 2000 results, returned as JSON from API / converted to Python list of
+        # dictionaries by sodapy.
+        results = client.get(self.resource_id, limit=100000)
+
+        # Convert to pandas DataFrame
+        results_df = pd.DataFrame.from_records(results)
+
+        return knext.Table.from_pandas(results_df)

From 103cbcd14d42702bbb3144e60d949832d1b07c91 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Tue, 5 Mar 2024 08:05:02 -0500
Subject: [PATCH 04/12] revise socrata

---
 knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index e09b7dc9..102fd90a 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset based on search keywords",
+    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
 )
 class SocrataSearchNode:
-    """Retrive the open data category via Socrata API.
-
+    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
+    
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
+       
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
         from pandas import json_normalize
+        from urllib.parse import quote
 
         query_item = self.queryitem
+        encoded_query_item = quote(query_item)
         request = Request(
-            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+            f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
         response = urlopen(request)
@@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
-        df = df.drop(
-            columns=["classification.domain_tags", "classification.domain_metadata"]
-        )
+        # Check if columns exist before dropping them
+        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
+        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+        df = df.drop(columns=columns_to_drop)
+
 
         # Find List
         list_columns = [
@@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset based on search keywords",
 )
 class SocrataDataNode:
-    """Retrive the open data category via Socrata API.
-
+    """Retrieve the open data category via Socrata API.
+
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
+  
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-
+        limit = 100000 
+        offset = 0
+        all_results = []
+        while True:
+            results = client.get(self.resource_id, limit=limit, offset=offset)
+            if not results:
+                break
+            all_results.extend(results)
+            offset += limit
         # Example authenticated client (needed for non-public datasets):
         # client = Socrata(data.cdc.gov,
         #                  MyAppToken,
@@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # First 2000 results, returned as JSON from API / converted to Python list of
         # dictionaries by sodapy.
-        results = client.get(self.resource_id, limit=100000)
-
         # Convert to pandas DataFrame
-        results_df = pd.DataFrame.from_records(results)
+        results_df = pd.DataFrame.from_records(all_results)
 
         return knext.Table.from_pandas(results_df)

From a9923a34772786bb757eb5ce496268ada2d713c3 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Thu, 6 Feb 2025 14:24:32 -0500
Subject: [PATCH 05/12] add time out  parameters

---
 knime_extension/src/nodes/opendata.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index e4beb6dd..70c16c54 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -923,6 +923,14 @@ class SocrataSearchNode:
         default_value="Massachusetts",
     )
 
+    timeout = knext.IntParameter(
+        label="Request timeout in seconds",
+        description="The timeout in seconds for the request API.",
+        default_value=120,
+        min_value=1,
+        is_advanced=True,
+    )
+
     def configure(self, configure_context):
         # TODO Create combined schema
         return None
@@ -940,7 +948,7 @@ def execute(self, exec_context: knext.ExecutionContext):
             f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
-        response = urlopen(request)
+        response = urlopen(request, timeout=self.timeout)
         response_body = response.read()
 
         # Load the JSON response into a Python dictionary
@@ -1043,7 +1051,15 @@ class SocrataDataNode:
         default_value="",
     )
 
-    def configure(self, configure_context):
+    timeout = knext.IntParameter(
+        label="Request timeout in seconds",
+        description="The timeout in seconds for the request API.",
+        default_value=120,
+        min_value=1,
+        is_advanced=True,
+    )
+
+    def configure(self, configure_context):
         # TODO Create combined schema
         return None
 
@@ -1056,6 +1072,7 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
+        client.timeout = self.timeout
         limit = 100000
         offset = 0
         all_results = []

From b625a6f4052a3216c48a57a731e9ec377e26a976 Mon Sep 17 00:00:00 2001
From: UrbanGISer <piantu@hotmail.com>
Date: Tue, 14 Nov 2023 17:30:41 -0500
Subject: [PATCH 06/12] Add Socrata Data Nodes

---
 knime_extension/geospatial_env.yml    |   1 +
 knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml
index a549339f..844dada7 100644
--- a/knime_extension/geospatial_env.yml
+++ b/knime_extension/geospatial_env.yml
@@ -36,3 +36,4 @@ dependencies:
   - pip:
     - ipinfo==4.4.3
     - pulp==2.7.0
+    - sodapy==2.2.0
diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index d7032034..e09b7dc9 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -878,3 +878,160 @@ def execute(self, exec_context: knext.ExecutionContext):
             crs="EPSG:4326",
         )
         return knext.Table.from_pandas(gdf)
+
+
+############################################
+# Socrata Search
+############################################
+@knext.node(
+    name="Socrata Search",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Search.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset list",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataSearchNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
+    """
+
+    queryitem = knext.StringParameter(
+        label="Input searching item",
+        description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. 
+                   This search is not case-sensitive and can include multiple words separated by spaces. """,
+        default_value="Massachusetts",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        from urllib.request import Request, urlopen
+        import pandas as pd
+        import json
+        from pandas import json_normalize
+
+        query_item = self.queryitem
+        request = Request(
+            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+        )
+
+        response = urlopen(request)
+        response_body = response.read()
+
+        # Load the JSON response into a Python dictionary
+        data = json.loads(response_body)
+
+        # Extract the "results" key, which contains the dataset information
+        dataset_info = data["results"]
+
+        # Create a DataFrame from the dataset information, and flatten the nested dictionaries
+        df = json_normalize(dataset_info)
+        df = df.drop(
+            columns=["classification.domain_tags", "classification.domain_metadata"]
+        )
+
+        # Find List
+        list_columns = [
+            col for col in df.columns if any(isinstance(item, list) for item in df[col])
+        ]
+
+        # Drop error list column
+        for col in list_columns:
+            try:
+                df[col] = df[col].apply(
+                    lambda x: ", ".join(x) if isinstance(x, list) else x
+                )
+            except Exception as e:
+                df.drop(columns=[col], inplace=True)
+
+        # Drop columns that cannot be saved in KNIME
+        drop_columns = []
+        for col in df.columns:
+            try:
+                # Attempt to convert the column to a KNIME-compatible data type
+                knime_table = knext.Table.from_pandas(df[[col]])
+            except Exception as e:
+                # If an exception is raised, add the column to the list of columns to drop
+                drop_columns.append(col)
+
+        # Drop the columns that cannot be saved in KNIME
+        df.drop(columns=drop_columns, inplace=True)
+        df.replace("?", pd.NA, inplace=True)
+        df.replace("", pd.NA, inplace=True)
+        df.dropna(axis=1, how="all", inplace=True)
+        df = df.reset_index(drop=True)
+        return knext.Table.from_pandas(df)
+
+
+############################################
+# Socrata Data Query
+############################################
+@knext.node(
+    name="Socrata Data Query",
+    node_type=knext.NodeType.SOURCE,
+    icon_path=__NODE_ICON_PATH + "Socrata Data Query.png",
+    category=__category,
+    after="",
+)
+@knext.output_table(
+    name="Socrata dataset",
+    description="Socrata dataset based on search keywords",
+)
+class SocrataDataNode:
+    """Retrive the open data category via Socrata API.
+
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
+
+    For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9"  and a metadata domain of "opendata.utah.gov".
+    They can be found in the link under API,"https://opendata.utah.gov/resource/q22t-rbk9.json".  Both the two items will be used for data retriving.
+    """
+
+    metadata_domain = knext.StringParameter(
+        label="Metadata domain",
+        description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    resource_id = knext.StringParameter(
+        label="Resource ID",
+        description="""The value in the column resource.id of a table generated by a Socrata Search node. """,
+        default_value="",
+    )
+
+    def configure(self, configure_context):
+        # TODO Create combined schema
+        return None
+
+    def execute(self, exec_context: knext.ExecutionContext):
+        import pandas as pd
+        import json
+        import pandas as pd
+        from sodapy import Socrata
+
+        # Unauthenticated client only works with public data sets. Note 'None'
+        # in place of application token, and no username or password:
+        client = Socrata(self.metadata_domain, None)
+
+        # Example authenticated client (needed for non-public datasets):
+        # client = Socrata(data.cdc.gov,
+        #                  MyAppToken,
+        #                  username="user@example.com",
+        #                  password="AFakePassword")
+
+        # First 2000 results, returned as JSON from API / converted to Python list of
+        # dictionaries by sodapy.
+        results = client.get(self.resource_id, limit=100000)
+
+        # Convert to pandas DataFrame
+        results_df = pd.DataFrame.from_records(results)
+
+        return knext.Table.from_pandas(results_df)

From d38ce33220871eab87e7b7046f9ffadceaa0c784 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Tue, 5 Mar 2024 08:05:02 -0500
Subject: [PATCH 07/12] revise socrata

---
 knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index e09b7dc9..102fd90a 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset based on search keywords",
+    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
 )
 class SocrataSearchNode:
-    """Retrive the open data category via Socrata API.
-
+    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
+    
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
+       
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
         from pandas import json_normalize
+        from urllib.parse import quote
 
         query_item = self.queryitem
+        encoded_query_item = quote(query_item)
         request = Request(
-            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+            f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
         response = urlopen(request)
@@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
-        df = df.drop(
-            columns=["classification.domain_tags", "classification.domain_metadata"]
-        )
+        # Check if columns exist before dropping them
+        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
+        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+        df = df.drop(columns=columns_to_drop)
+
 
         # Find List
         list_columns = [
@@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset based on search keywords",
 )
 class SocrataDataNode:
-    """Retrive the open data category via Socrata API.
-
+    """Retrieve the open data category via Socrata API.
+
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
+  
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-
+        limit = 100000 
+        offset = 0
+        all_results = []
+        while True:
+            results = client.get(self.resource_id, limit=limit, offset=offset)
+            if not results:
+                break
+            all_results.extend(results)
+            offset += limit
         # Example authenticated client (needed for non-public datasets):
         # client = Socrata(data.cdc.gov,
         #                  MyAppToken,
@@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # First 2000 results, returned as JSON from API / converted to Python list of
         # dictionaries by sodapy.
-        results = client.get(self.resource_id, limit=100000)
-
         # Convert to pandas DataFrame
-        results_df = pd.DataFrame.from_records(results)
+        results_df = pd.DataFrame.from_records(all_results)
 
         return knext.Table.from_pandas(results_df)

From da4ed57a32efe7c17ecd7be9cb77835871353a1c Mon Sep 17 00:00:00 2001
From: UrbanGISer <piantu@hotmail.com>
Date: Tue, 14 Nov 2023 17:30:41 -0500
Subject: [PATCH 08/12] Add Socrata Data Nodes

---
 knime_extension/src/nodes/opendata.py | 66 ++++++---------------------
 1 file changed, 13 insertions(+), 53 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index 102fd90a..e09b7dc9 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -892,26 +892,11 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
+    description="Socrata dataset based on search keywords",
 )
 class SocrataSearchNode:
-    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
-    
-    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
-    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
-    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
-
-    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
-    healthcare, transportation, and the environment.
-
-    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
-    
-    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
-    
-    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
-    
-    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
-       
+    """Retrive the open data category via Socrata API.
+
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -932,12 +917,10 @@ def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
         from pandas import json_normalize
-        from urllib.parse import quote
 
         query_item = self.queryitem
-        encoded_query_item = quote(query_item)
         request = Request(
-            f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
+            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
         )
 
         response = urlopen(request)
@@ -951,11 +934,9 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
-        # Check if columns exist before dropping them
-        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
-        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
-        df = df.drop(columns=columns_to_drop)
-
+        df = df.drop(
+            columns=["classification.domain_tags", "classification.domain_metadata"]
+        )
 
         # Find List
         list_columns = [
@@ -1005,23 +986,8 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset based on search keywords",
 )
 class SocrataDataNode:
-    """Retrieve the open data category via Socrata API.
-
-    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
-    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
-    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
-
-    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
-    healthcare, transportation, and the environment.
-
-    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
-    
-    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
-    
-    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
-    
-    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
-  
+    """Retrive the open data category via Socrata API.
+
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -1054,15 +1020,7 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-        limit = 100000 
-        offset = 0
-        all_results = []
-        while True:
-            results = client.get(self.resource_id, limit=limit, offset=offset)
-            if not results:
-                break
-            all_results.extend(results)
-            offset += limit
+
         # Example authenticated client (needed for non-public datasets):
         # client = Socrata(data.cdc.gov,
         #                  MyAppToken,
@@ -1071,7 +1029,9 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # First 2000 results, returned as JSON from API / converted to Python list of
         # dictionaries by sodapy.
+        results = client.get(self.resource_id, limit=100000)
+
         # Convert to pandas DataFrame
-        results_df = pd.DataFrame.from_records(all_results)
+        results_df = pd.DataFrame.from_records(results)
 
         return knext.Table.from_pandas(results_df)

From fd34763b23431c7ea561f06f8e2d3b8c4435a0e7 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Tue, 5 Mar 2024 08:05:02 -0500
Subject: [PATCH 09/12] revise socrata

---
 knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index e09b7dc9..102fd90a 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset based on search keywords",
+    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
 )
 class SocrataSearchNode:
-    """Retrive the open data category via Socrata API.
-
+    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
+    
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
+       
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
         from pandas import json_normalize
+        from urllib.parse import quote
 
         query_item = self.queryitem
+        encoded_query_item = quote(query_item)
         request = Request(
-            f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000"
+            f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
         response = urlopen(request)
@@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
-        df = df.drop(
-            columns=["classification.domain_tags", "classification.domain_metadata"]
-        )
+        # Check if columns exist before dropping them
+        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
+        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+        df = df.drop(columns=columns_to_drop)
+
 
         # Find List
         list_columns = [
@@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset based on search keywords",
 )
 class SocrataDataNode:
-    """Retrive the open data category via Socrata API.
-
+    """Retrieve the open data category via Socrata API.
+
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
+
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    healthcare, transportation, and the environment.
+
+    Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
+    
+    NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
+    
+    UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
+    
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
+  
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-
+        limit = 100000 
+        offset = 0
+        all_results = []
+        while True:
+            results = client.get(self.resource_id, limit=limit, offset=offset)
+            if not results:
+                break
+            all_results.extend(results)
+            offset += limit
         # Example authenticated client (needed for non-public datasets):
         # client = Socrata(data.cdc.gov,
         #                  MyAppToken,
@@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext):
 
         # First 2000 results, returned as JSON from API / converted to Python list of
         # dictionaries by sodapy.
-        results = client.get(self.resource_id, limit=100000)
-
         # Convert to pandas DataFrame
-        results_df = pd.DataFrame.from_records(results)
+        results_df = pd.DataFrame.from_records(all_results)
 
         return knext.Table.from_pandas(results_df)

From df68fa27f5a0f17e62df93ca191462c1f1c10809 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Thu, 6 Feb 2025 14:24:32 -0500
Subject: [PATCH 10/12] add timeout parameters

---
 knime_extension/src/nodes/opendata.py | 63 +++++++++++++++++----------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index 102fd90a..70c16c54 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -895,23 +895,23 @@ def execute(self, exec_context: knext.ExecutionContext):
     description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
 )
 class SocrataSearchNode:
-    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface. 
-    
-    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
-    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface.
+
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards,
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys,
     epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
 
-    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education,
     healthcare, transportation, and the environment.
 
     Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
-    
+
     NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
-    
+
     UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
-    
+
     World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
-       
+
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
@@ -923,6 +923,14 @@ class SocrataSearchNode:
         default_value="Massachusetts",
     )
 
+    timeout = knext.IntParameter(
+        label="Request timeout in seconds",
+        description="The timeout in seconds for the request API.",
+        default_value=120,
+        min_value=1,
+        is_advanced=True,
+    )
+
     def configure(self, configure_context):
         # TODO Create combined schema
         return None
@@ -940,7 +948,7 @@ def execute(self, exec_context: knext.ExecutionContext):
             f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000"
         )
 
-        response = urlopen(request)
+        response = urlopen(request, timeout=self.timeout)
         response_body = response.read()
 
         # Load the JSON response into a Python dictionary
@@ -952,11 +960,13 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Create a DataFrame from the dataset information, and flatten the nested dictionaries
         df = json_normalize(dataset_info)
         # Check if columns exist before dropping them
-        columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"]
+        columns_to_drop = [
+            "classification.domain_tags",
+            "classification.domain_metadata",
+        ]
         columns_to_drop = [col for col in columns_to_drop if col in df.columns]
         df = df.drop(columns=columns_to_drop)
 
-
         # Find List
         list_columns = [
             col for col in df.columns if any(isinstance(item, list) for item in df[col])
@@ -1007,21 +1017,21 @@ def execute(self, exec_context: knext.ExecutionContext):
 class SocrataDataNode:
     """Retrieve the open data category via Socrata API.
 
-    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, 
-    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, 
+    US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards,
+    injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys,
     epidemiological studies, and collaborative research efforts conducted by the CDC and its partners.
 
-    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, 
+    Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education,
     healthcare, transportation, and the environment.
 
     Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more.
-    
+
     NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city.
-    
+
     UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more.
-    
-    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.   
-  
+
+    World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
+
     The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.
 
@@ -1041,7 +1051,15 @@ class SocrataDataNode:
         default_value="",
     )
 
-    def configure(self, configure_context):
+    timeout = knext.IntParameter(
+        label="Request timeout in seconds",
+        description="The timeout in seconds for the request API.",
+        default_value=120,
+        min_value=1,
+        is_advanced=True,
+    )
+
+    def configure(self, configure_context, input_schema_1):
         # TODO Create combined schema
         return None
 
@@ -1054,7 +1072,8 @@ def execute(self, exec_context: knext.ExecutionContext):
         # Unauthenticated client only works with public data sets. Note 'None'
         # in place of application token, and no username or password:
         client = Socrata(self.metadata_domain, None)
-        limit = 100000 
+        client.timeout = self.timeout
+        limit = 100000
         offset = 0
         all_results = []
         while True:

From 1049e56cc0d638e661eb0004656512cea60815d0 Mon Sep 17 00:00:00 2001
From: Tobias Koetter <tobias.koetter@knime.com>
Date: Fri, 28 Mar 2025 17:10:43 +0100
Subject: [PATCH 11/12] Fix bug in configure of Socrata Data Query node

---
 knime_extension/src/nodes/opendata.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index 70c16c54..fe2de0ce 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -892,10 +892,10 @@ def execute(self, exec_context: knext.ExecutionContext):
 )
 @knext.output_table(
     name="Socrata dataset list",
-    description="Socrata dataset list from  a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ",
+    description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term.",
 )
 class SocrataSearchNode:
-    """Access open datasets from various well-known data resources and organizations effortlessly  using the SODA interface.
+    """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface.
 
     US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards,
     injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys,
@@ -1059,7 +1059,7 @@ class SocrataDataNode:
         is_advanced=True,
     )
 
-    def configure(self, configure_context, input_schema_1):
+    def configure(self, configure_context):
         # TODO Create combined schema
         return None
 

From 3070ac4c3e51400ebc07eaf5e3b7f5183272d9a4 Mon Sep 17 00:00:00 2001
From: Lingbo Liu <piantu@hotmail.com>
Date: Fri, 2 May 2025 12:50:55 -0400
Subject: [PATCH 12/12] update Socrata Nodes

Add a note to the Search Node description that only open datasets are available; add query-filter, api-token,
set_progress and error handling in Data Node
---
 knime_extension/src/nodes/opendata.py | 236 +++++++++++++++++++++++---
 1 file changed, 217 insertions(+), 19 deletions(-)

diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py
index fe2de0ce..eaf5b26d 100644
--- a/knime_extension/src/nodes/opendata.py
+++ b/knime_extension/src/nodes/opendata.py
@@ -912,7 +912,9 @@ class SocrataSearchNode:
 
     World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends.
 
-    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs..
+    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits, and NGOs.
+
+    Note: This node only retrieves publicly available datasets, as no authentication is provided.
     This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
     """
 
@@ -997,6 +999,22 @@ def execute(self, exec_context: knext.ExecutionContext):
         df.replace("", pd.NA, inplace=True)
         df.dropna(axis=1, how="all", inplace=True)
         df = df.reset_index(drop=True)
+
+        # Reorder the columns to have "metadata.domain" and "resource.id" at the beginning
+        important_cols = []
+        if "metadata.domain" in df.columns:
+            important_cols.append("metadata.domain")
+        if "resource.id" in df.columns:
+            important_cols.append("resource.id")
+
+        # Create the reordered column list
+        remaining_cols = [col for col in df.columns if col not in important_cols]
+        reordered_cols = important_cols + remaining_cols
+
+        # Reorder the DataFrame columns
+        if important_cols:
+            df = df[reordered_cols]
+
         return knext.Table.from_pandas(df)
 
 
@@ -1051,6 +1069,34 @@ class SocrataDataNode:
         default_value="",
     )
 
+    app_token = knext.StringParameter(
+        label="Application Token",
+        description="""Optional: Provide an application token to increase API request limits. 
+                    You can register for a token at [Application Tokens](https://dev.socrata.com/docs/app-tokens.html)""",
+        default_value="",
+        is_advanced=True,
+    )
+
+    query_filter = knext.StringParameter(
+        label="Query Filter",
+        description="""Provide filtering conditions to narrow down results. Socrata API supports two main filtering mechanisms:
+
+        1. Simple Filters: Use column names directly as parameters. Examples:
+        - source=nn (query for records where source field equals 'nn')
+        - source=pr&region=Virgin Islands region (multiple conditions, combined with AND)
+
+        2. SoQL Query Language: Use $where parameter for more complex queries:
+        - $where=magnitude > 3.0 (numeric comparison)
+        - $where=datetime > '2020-01-01' (date comparison)
+        - $where=state='NY' AND age > 30 (combined conditions)
+        - $where=annual_salary between '40000' and '60000' (range query)
+
+        For more information, see [Simple filter](https://dev.socrata.com/docs/filtering.html) and [SoQL Queries](https://dev.socrata.com/docs/queries/)
+        """,
+        default_value="",
+        is_advanced=True,
+    )
+
     timeout = knext.IntParameter(
         label="Request timeout in seconds",
         description="The timeout in seconds for the request API.",
@@ -1066,31 +1112,183 @@ def configure(self, configure_context):
     def execute(self, exec_context: knext.ExecutionContext):
         import pandas as pd
         import json
-        import pandas as pd
         from sodapy import Socrata
+        import requests
+
+        # Start with validation phase
+        exec_context.set_progress(0.05, "Validating connection and parameters...")
+
+        # Validate the required parameters
+        if not self.metadata_domain or self.metadata_domain.strip() == "":
+            raise knext.InvalidParametersError("Metadata domain cannot be empty")
+
+        if not self.resource_id or self.resource_id.strip() == "":
+            raise knext.InvalidParametersError("Resource ID cannot be empty")
+
+        # Initialize app token if provided
+        app_token = (
+            None
+            if not self.app_token or self.app_token.strip() == ""
+            else self.app_token
+        )
+
+        # Create Socrata client and validate it works
+        try:
+            client = Socrata(self.metadata_domain, app_token)
+            client.timeout = self.timeout
+
+            # Validate that we can connect to the API with a minimal test query
+            validation_params = {"$limit": 1}
 
-        # Unauthenticated client only works with public data sets. Note 'None'
-        # in place of application token, and no username or password:
-        client = Socrata(self.metadata_domain, None)
-        client.timeout = self.timeout
+            # First validate the connection and resource ID by attempting a minimal query
+            test_results = client.get(self.resource_id, **validation_params)
+
+            if test_results is None:
+                raise knext.InvalidParametersError(
+                    f"Resource ID '{self.resource_id}' is invalid or not accessible"
+                )
+
+        except Exception as e:
+            # Connection or resource ID validation failed
+            error_message = str(e)
+            raise knext.InvalidParametersError(
+                f"Failed to connect to Socrata API: {error_message}"
+            )
+
+        # Now validate query filter if provided
+        query_params = {}
+        if self.query_filter and self.query_filter.strip() != "":
+            filter_text = self.query_filter.strip()
+
+            try:
+                # Parse the filter based on its format
+                if filter_text.lower().startswith("$where="):
+                    query_params["$where"] = filter_text[7:]
+                elif "=" in filter_text and not filter_text.startswith("$"):
+                    filter_pairs = filter_text.split("&")
+                    for pair in filter_pairs:
+                        if "=" in pair:
+                            key, value = pair.split("=", 1)
+                            query_params[key.strip()] = value.strip()
+                else:
+                    query_params["$where"] = filter_text
+
+                # Validate the filter by making a test query
+                test_filter_params = {**query_params, "$limit": 1}
+                test_filter_results = client.get(self.resource_id, **test_filter_params)
+
+                # If we get here, the filter is valid
+
+            except Exception as e:
+                # Filter validation failed
+                error_message = str(e)
+                raise knext.InvalidParametersError(
+                    f"Invalid SQL Filter: {error_message}"
+                )
+
+        # If all validations pass, proceed to data retrieval phase
+        exec_context.set_progress(
+            0.1, "Connection and parameters validated. Determining dataset size..."
+        )
+
+        # Try to get the total count to calculate progress
+        total_records = 0
+        try:
+            # Add COUNT(*) to our validated query parameters
+            count_params = {**query_params, "$select": "COUNT(*)"}
+            count_results = client.get(self.resource_id, **count_params)
+            total_records = int(count_results[0]["COUNT"]) if count_results else 0
+
+            exec_context.set_progress(
+                0.15, f"Dataset size: {total_records:,} records. Starting download..."
+            )
+        except Exception as e:
+            # Non-critical error: proceed without exact count
+            exec_context.set_progress(
+                0.15,
+                f"Could not determine dataset size: {str(e)}. Proceeding with download...",
+            )
+            total_records = 0
+
+        # Check if there's data to retrieve
+        if total_records == 0:
+            # Try a test query to see if there's any data at all
+            test_data_params = {**query_params, "$limit": 1}
+            test_data_results = client.get(self.resource_id, **test_data_params)
+
+            if not test_data_results:
+                # No data found for this query
+                raise knext.InvalidParametersError("No data found for the given query")
+
+        # Main data retrieval phase
         limit = 100000
         offset = 0
         all_results = []
+        rows_retrieved = 0
+        iteration_count = 0
+
         while True:
-            results = client.get(self.resource_id, limit=limit, offset=offset)
-            if not results:
-                break
-            all_results.extend(results)
-            offset += limit
-        # Example authenticated client (needed for non-public datasets):
-        # client = Socrata(data.cdc.gov,
-        #                  MyAppToken,
-        #                  username="user@example.com",
-        #                  password="AFakePassword")
-
-        # First 2000 results, returned as JSON from API / converted to Python list of
-        # dictionaries by sodapy.
+            # Update progress
+            if total_records > 0:
+                progress_percent = min(
+                    0.15 + 0.75 * (rows_retrieved / total_records), 0.9
+                )
+                progress_message = f"Downloaded {rows_retrieved:,} of {total_records:,} records ({(rows_retrieved / total_records * 100):.1f}%)"
+            else:
+                progress_percent = min(0.15 + (iteration_count * 0.05), 0.9)
+                progress_message = (
+                    f"Downloaded {rows_retrieved:,} records so far (offset: {offset:,})"
+                )
+
+            exec_context.set_progress(progress_percent, progress_message)
+            iteration_count += 1
+
+            # Prepare pagination parameters
+            current_params = {**query_params, "$limit": limit, "$offset": offset}
+
+            try:
+                # Get results
+                results = client.get(self.resource_id, **current_params)
+
+                if not results:
+                    # End of pagination
+                    break
+
+                all_results.extend(results)
+                rows_retrieved += len(results)
+                offset += limit
+
+            except requests.exceptions.HTTPError as e:
+                # Handle HTTP errors that might occur during pagination
+                error_message = str(e)
+                if 400 <= e.response.status_code < 500:
+                    raise knext.InvalidParametersError(
+                        f"Query error during pagination: {error_message}"
+                    )
+                else:
+                    raise knext.InvalidParametersError(
+                        f"Server error during data retrieval: {error_message}"
+                    )
+            except Exception as e:
+                # Handle other exceptions
+                error_message = str(e)
+                raise knext.InvalidParametersError(
+                    f"Error during data retrieval: {error_message}"
+                )
+
+        # Final processing phase
+        if not all_results:
+            raise knext.InvalidParametersError(
+                "No data was retrieved. The query may be too restrictive."
+            )
+
+        exec_context.set_progress(0.95, f"Processing {rows_retrieved:,} records...")
+
         # Convert to pandas DataFrame
         results_df = pd.DataFrame.from_records(all_results)
 
+        # Complete
+        exec_context.set_progress(
+            1.0, f"Complete: Retrieved {rows_retrieved:,} records"
+        )
         return knext.Table.from_pandas(results_df)