From 9901253f465f5db4a7d75e645e493f92c1a66a7c Mon Sep 17 00:00:00 2001 From: UrbanGISer Date: Tue, 14 Nov 2023 17:30:41 -0500 Subject: [PATCH 01/11] Add Socrata Data Nodes --- knime_extension/geospatial_env.yml | 1 + knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml index 46a6a72d..fb95bb2d 100644 --- a/knime_extension/geospatial_env.yml +++ b/knime_extension/geospatial_env.yml @@ -35,3 +35,4 @@ dependencies: - pointpats=2.3.0 - pip: - ipinfo==4.4.3 + - sodapy==2.2.0 diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index b9a47de3..9a370f48 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -686,3 +686,160 @@ def execute(self, exec_context: knext.ExecutionContext): gdf = get_osmnx().geocoder.geocode_to_gdf(self.placename) gdf = gdf.reset_index(drop=True) return knext.Table.from_pandas(gdf) + + +############################################ +# Socrata Search +############################################ +@knext.node( + name="Socrata Search", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Search.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset list", + description="Socrata dataset based on search keywords", +) +class SocrataSearchNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. + """ + + queryitem = knext.StringParameter( + label="Input searching item", + description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. + This search is not case-sensitive and can include multiple words separated by spaces. """, + default_value="Massachusetts", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + from urllib.request import Request, urlopen + import pandas as pd + import json + from pandas import json_normalize + + query_item = self.queryitem + request = Request( + f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + ) + + response = urlopen(request) + response_body = response.read() + + # Load the JSON response into a Python dictionary + data = json.loads(response_body) + + # Extract the "results" key, which contains the dataset information + dataset_info = data["results"] + + # Create a DataFrame from the dataset information, and flatten the nested dictionaries + df = json_normalize(dataset_info) + df = df.drop( + columns=["classification.domain_tags", "classification.domain_metadata"] + ) + + # Find List + list_columns = [ + col for col in df.columns if any(isinstance(item, list) for item in df[col]) + ] + + # Drop error list column + for col in list_columns: + try: + df[col] = df[col].apply( + lambda x: ", ".join(x) if isinstance(x, list) else x + ) + except Exception as e: + df.drop(columns=[col], inplace=True) + + # Drop columns that cannot be saved in KNIME + drop_columns = [] + for col in df.columns: + try: + # Attempt to convert the column to a KNIME-compatible data type + knime_table = knext.Table.from_pandas(df[[col]]) + except Exception as e: + # If an exception is raised, add the column to the list of columns to drop + drop_columns.append(col) + + # Drop the columns that cannot be saved in KNIME + df.drop(columns=drop_columns, inplace=True) + df.replace("?", pd.NA, inplace=True) + df.replace("", pd.NA, inplace=True) + df.dropna(axis=1, how="all", inplace=True) + df = df.reset_index(drop=True) + return knext.Table.from_pandas(df) + + +############################################ +# Socrata Data Query +############################################ +@knext.node( + name="Socrata Data Query", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Data Query.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset", + description="Socrata dataset based on search keywords", +) +class SocrataDataNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. + + For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9" and a metadata domain of "opendata.utah.gov". + They can be found in the link under API,"https://opendata.utah.gov/resource/q22t-rbk9.json". Both the two items will be used for data retriving. + """ + + metadata_domain = knext.StringParameter( + label="Metadata domain", + description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """, + default_value="", + ) + + resource_id = knext.StringParameter( + label="Resource ID", + description="""The value in the column resource.id of a table generated by a Socrata Search node. """, + default_value="", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + import pandas as pd + import json + import pandas as pd + from sodapy import Socrata + + # Unauthenticated client only works with public data sets. Note 'None' + # in place of application token, and no username or password: + client = Socrata(self.metadata_domain, None) + + # Example authenticated client (needed for non-public datasets): + # client = Socrata(data.cdc.gov, + # MyAppToken, + # username="user@example.com", + # password="AFakePassword") + + # First 2000 results, returned as JSON from API / converted to Python list of + # dictionaries by sodapy. + results = client.get(self.resource_id, limit=100000) + + # Convert to pandas DataFrame + results_df = pd.DataFrame.from_records(results) + + return knext.Table.from_pandas(results_df) From 29de8d32f1a58da8073d6556d0fa0a5df7a3bd32 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Tue, 5 Mar 2024 08:05:02 -0500 Subject: [PATCH 02/11] revise socrata --- knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index 9a370f48..dd32fa1f 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -700,11 +700,26 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset based on search keywords", + description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", ) class SocrataSearchNode: - """Retrive the open data category via Socrata API. - + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -725,10 +740,12 @@ def execute(self, exec_context: knext.ExecutionContext): import pandas as pd import json from pandas import json_normalize + from urllib.parse import quote query_item = self.queryitem + encoded_query_item = quote(query_item) request = Request( - f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) response = urlopen(request) @@ -742,9 +759,11 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) - df = df.drop( - columns=["classification.domain_tags", "classification.domain_metadata"] - ) + # Check if columns exist before dropping them + columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] + columns_to_drop = [col for col in columns_to_drop if col in df.columns] + df = df.drop(columns=columns_to_drop) + # Find List list_columns = [ @@ -794,8 +813,23 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset based on search keywords", ) class SocrataDataNode: - """Retrive the open data category via Socrata API. - + """Retrieve the open data category via Socrata API. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -828,7 +862,15 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - + limit = 100000 + offset = 0 + all_results = [] + while True: + results = client.get(self.resource_id, limit=limit, offset=offset) + if not results: + break + all_results.extend(results) + offset += limit # Example authenticated client (needed for non-public datasets): # client = Socrata(data.cdc.gov, # MyAppToken, @@ -837,9 +879,7 @@ def execute(self, exec_context: knext.ExecutionContext): # First 2000 results, returned as JSON from API / converted to Python list of # dictionaries by sodapy. - results = client.get(self.resource_id, limit=100000) - # Convert to pandas DataFrame - results_df = pd.DataFrame.from_records(results) + results_df = pd.DataFrame.from_records(all_results) return knext.Table.from_pandas(results_df) From 6e512200dff86fb954ea55a8212741081464138b Mon Sep 17 00:00:00 2001 From: UrbanGISer Date: Tue, 14 Nov 2023 17:30:41 -0500 Subject: [PATCH 03/11] Add Socrata Data Nodes --- knime_extension/geospatial_env.yml | 1 + knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml index a549339f..844dada7 100644 --- a/knime_extension/geospatial_env.yml +++ b/knime_extension/geospatial_env.yml @@ -36,3 +36,4 @@ dependencies: - pip: - ipinfo==4.4.3 - pulp==2.7.0 + - sodapy==2.2.0 diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index d7032034..e09b7dc9 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -878,3 +878,160 @@ def execute(self, exec_context: knext.ExecutionContext): crs="EPSG:4326", ) return knext.Table.from_pandas(gdf) + + +############################################ +# Socrata Search +############################################ +@knext.node( + name="Socrata Search", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Search.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset list", + description="Socrata dataset based on search keywords", +) +class SocrataSearchNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. + """ + + queryitem = knext.StringParameter( + label="Input searching item", + description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. + This search is not case-sensitive and can include multiple words separated by spaces. """, + default_value="Massachusetts", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + from urllib.request import Request, urlopen + import pandas as pd + import json + from pandas import json_normalize + + query_item = self.queryitem + request = Request( + f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + ) + + response = urlopen(request) + response_body = response.read() + + # Load the JSON response into a Python dictionary + data = json.loads(response_body) + + # Extract the "results" key, which contains the dataset information + dataset_info = data["results"] + + # Create a DataFrame from the dataset information, and flatten the nested dictionaries + df = json_normalize(dataset_info) + df = df.drop( + columns=["classification.domain_tags", "classification.domain_metadata"] + ) + + # Find List + list_columns = [ + col for col in df.columns if any(isinstance(item, list) for item in df[col]) + ] + + # Drop error list column + for col in list_columns: + try: + df[col] = df[col].apply( + lambda x: ", ".join(x) if isinstance(x, list) else x + ) + except Exception as e: + df.drop(columns=[col], inplace=True) + + # Drop columns that cannot be saved in KNIME + drop_columns = [] + for col in df.columns: + try: + # Attempt to convert the column to a KNIME-compatible data type + knime_table = knext.Table.from_pandas(df[[col]]) + except Exception as e: + # If an exception is raised, add the column to the list of columns to drop + drop_columns.append(col) + + # Drop the columns that cannot be saved in KNIME + df.drop(columns=drop_columns, inplace=True) + df.replace("?", pd.NA, inplace=True) + df.replace("", pd.NA, inplace=True) + df.dropna(axis=1, how="all", inplace=True) + df = df.reset_index(drop=True) + return knext.Table.from_pandas(df) + + +############################################ +# Socrata Data Query +############################################ +@knext.node( + name="Socrata Data Query", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Data Query.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset", + description="Socrata dataset based on search keywords", +) +class SocrataDataNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. + + For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9" and a metadata domain of "opendata.utah.gov". + They can be found in the link under API,"https://opendata.utah.gov/resource/q22t-rbk9.json". Both the two items will be used for data retriving. + """ + + metadata_domain = knext.StringParameter( + label="Metadata domain", + description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """, + default_value="", + ) + + resource_id = knext.StringParameter( + label="Resource ID", + description="""The value in the column resource.id of a table generated by a Socrata Search node. """, + default_value="", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + import pandas as pd + import json + import pandas as pd + from sodapy import Socrata + + # Unauthenticated client only works with public data sets. Note 'None' + # in place of application token, and no username or password: + client = Socrata(self.metadata_domain, None) + + # Example authenticated client (needed for non-public datasets): + # client = Socrata(data.cdc.gov, + # MyAppToken, + # username="user@example.com", + # password="AFakePassword") + + # First 2000 results, returned as JSON from API / converted to Python list of + # dictionaries by sodapy. + results = client.get(self.resource_id, limit=100000) + + # Convert to pandas DataFrame + results_df = pd.DataFrame.from_records(results) + + return knext.Table.from_pandas(results_df) From 103cbcd14d42702bbb3144e60d949832d1b07c91 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Tue, 5 Mar 2024 08:05:02 -0500 Subject: [PATCH 04/11] revise socrata --- knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index e09b7dc9..102fd90a 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset based on search keywords", + description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", ) class SocrataSearchNode: - """Retrive the open data category via Socrata API. - + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext): import pandas as pd import json from pandas import json_normalize + from urllib.parse import quote query_item = self.queryitem + encoded_query_item = quote(query_item) request = Request( - f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) response = urlopen(request) @@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) - df = df.drop( - columns=["classification.domain_tags", "classification.domain_metadata"] - ) + # Check if columns exist before dropping them + columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] + columns_to_drop = [col for col in columns_to_drop if col in df.columns] + df = df.drop(columns=columns_to_drop) + # Find List list_columns = [ @@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset based on search keywords", ) class SocrataDataNode: - """Retrive the open data category via Socrata API. - + """Retrieve the open data category via Socrata API. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - + limit = 100000 + offset = 0 + all_results = [] + while True: + results = client.get(self.resource_id, limit=limit, offset=offset) + if not results: + break + all_results.extend(results) + offset += limit # Example authenticated client (needed for non-public datasets): # client = Socrata(data.cdc.gov, # MyAppToken, @@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext): # First 2000 results, returned as JSON from API / converted to Python list of # dictionaries by sodapy. - results = client.get(self.resource_id, limit=100000) - # Convert to pandas DataFrame - results_df = pd.DataFrame.from_records(results) + results_df = pd.DataFrame.from_records(all_results) return knext.Table.from_pandas(results_df) From a9923a34772786bb757eb5ce496268ada2d713c3 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Thu, 6 Feb 2025 14:24:32 -0500 Subject: [PATCH 05/11] add time out parameters --- knime_extension/src/nodes/opendata.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index e4beb6dd..70c16c54 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -923,6 +923,14 @@ class SocrataSearchNode: default_value="Massachusetts", ) + timeout = knext.IntParameter( + label="Request timeout in seconds", + description="The timeout in seconds for the request API.", + default_value=120, + min_value=1, + is_advanced=True, + ) + def configure(self, configure_context): # TODO Create combined schema return None @@ -940,7 +948,7 @@ def execute(self, exec_context: knext.ExecutionContext): f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) - response = urlopen(request) + response = urlopen(request, timeout=self.timeout) response_body = response.read() # Load the JSON response into a Python dictionary @@ -1043,7 +1051,15 @@ class SocrataDataNode: default_value="", ) - def configure(self, configure_context): + timeout = knext.IntParameter( + label="Request timeout in seconds", + description="The timeout in seconds for the request API.", + default_value=120, + min_value=1, + is_advanced=True, + ) + + def configure(self, configure_context, input_schema_1): # TODO Create combined schema return None @@ -1056,6 +1072,7 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) + client.timeout = self.timeout limit = 100000 offset = 0 all_results = [] From b625a6f4052a3216c48a57a731e9ec377e26a976 Mon Sep 17 00:00:00 2001 From: UrbanGISer Date: Tue, 14 Nov 2023 17:30:41 -0500 Subject: [PATCH 06/11] Add Socrata Data Nodes --- knime_extension/geospatial_env.yml | 1 + knime_extension/src/nodes/opendata.py | 157 ++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/knime_extension/geospatial_env.yml b/knime_extension/geospatial_env.yml index a549339f..844dada7 100644 --- a/knime_extension/geospatial_env.yml +++ b/knime_extension/geospatial_env.yml @@ -36,3 +36,4 @@ dependencies: - pip: - ipinfo==4.4.3 - pulp==2.7.0 + - sodapy==2.2.0 diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index d7032034..e09b7dc9 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -878,3 +878,160 @@ def execute(self, exec_context: knext.ExecutionContext): crs="EPSG:4326", ) return knext.Table.from_pandas(gdf) + + +############################################ +# Socrata Search +############################################ +@knext.node( + name="Socrata Search", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Search.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset list", + description="Socrata dataset based on search keywords", +) +class SocrataSearchNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. + """ + + queryitem = knext.StringParameter( + label="Input searching item", + description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database. + This search is not case-sensitive and can include multiple words separated by spaces. """, + default_value="Massachusetts", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + from urllib.request import Request, urlopen + import pandas as pd + import json + from pandas import json_normalize + + query_item = self.queryitem + request = Request( + f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + ) + + response = urlopen(request) + response_body = response.read() + + # Load the JSON response into a Python dictionary + data = json.loads(response_body) + + # Extract the "results" key, which contains the dataset information + dataset_info = data["results"] + + # Create a DataFrame from the dataset information, and flatten the nested dictionaries + df = json_normalize(dataset_info) + df = df.drop( + columns=["classification.domain_tags", "classification.domain_metadata"] + ) + + # Find List + list_columns = [ + col for col in df.columns if any(isinstance(item, list) for item in df[col]) + ] + + # Drop error list column + for col in list_columns: + try: + df[col] = df[col].apply( + lambda x: ", ".join(x) if isinstance(x, list) else x + ) + except Exception as e: + df.drop(columns=[col], inplace=True) + + # Drop columns that cannot be saved in KNIME + drop_columns = [] + for col in df.columns: + try: + # Attempt to convert the column to a KNIME-compatible data type + knime_table = knext.Table.from_pandas(df[[col]]) + except Exception as e: + # If an exception is raised, add the column to the list of columns to drop + drop_columns.append(col) + + # Drop the columns that cannot be saved in KNIME + df.drop(columns=drop_columns, inplace=True) + df.replace("?", pd.NA, inplace=True) + df.replace("", pd.NA, inplace=True) + df.dropna(axis=1, how="all", inplace=True) + df = df.reset_index(drop=True) + return knext.Table.from_pandas(df) + + +############################################ +# Socrata Data Query +############################################ +@knext.node( + name="Socrata Data Query", + node_type=knext.NodeType.SOURCE, + icon_path=__NODE_ICON_PATH + "Socrata Data Query.png", + category=__category, + after="", +) +@knext.output_table( + name="Socrata dataset", + description="Socrata dataset based on search keywords", +) +class SocrataDataNode: + """Retrive the open data category via Socrata API. + + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. + This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. + + For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9" and a metadata domain of "opendata.utah.gov". + They can be found in the link under API,"https://opendata.utah.gov/resource/q22t-rbk9.json". Both the two items will be used for data retriving. + """ + + metadata_domain = knext.StringParameter( + label="Metadata domain", + description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """, + default_value="", + ) + + resource_id = knext.StringParameter( + label="Resource ID", + description="""The value in the column resource.id of a table generated by a Socrata Search node. """, + default_value="", + ) + + def configure(self, configure_context): + # TODO Create combined schema + return None + + def execute(self, exec_context: knext.ExecutionContext): + import pandas as pd + import json + import pandas as pd + from sodapy import Socrata + + # Unauthenticated client only works with public data sets. Note 'None' + # in place of application token, and no username or password: + client = Socrata(self.metadata_domain, None) + + # Example authenticated client (needed for non-public datasets): + # client = Socrata(data.cdc.gov, + # MyAppToken, + # username="user@example.com", + # password="AFakePassword") + + # First 2000 results, returned as JSON from API / converted to Python list of + # dictionaries by sodapy. + results = client.get(self.resource_id, limit=100000) + + # Convert to pandas DataFrame + results_df = pd.DataFrame.from_records(results) + + return knext.Table.from_pandas(results_df) From d38ce33220871eab87e7b7046f9ffadceaa0c784 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Tue, 5 Mar 2024 08:05:02 -0500 Subject: [PATCH 07/11] revise socrata --- knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index e09b7dc9..102fd90a 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset based on search keywords", + description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", ) class SocrataSearchNode: - """Retrive the open data category via Socrata API. - + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext): import pandas as pd import json from pandas import json_normalize + from urllib.parse import quote query_item = self.queryitem + encoded_query_item = quote(query_item) request = Request( - f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) response = urlopen(request) @@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) - df = df.drop( - columns=["classification.domain_tags", "classification.domain_metadata"] - ) + # Check if columns exist before dropping them + columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] + columns_to_drop = [col for col in columns_to_drop if col in df.columns] + df = df.drop(columns=columns_to_drop) + # Find List list_columns = [ @@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset based on search keywords", ) class SocrataDataNode: - """Retrive the open data category via Socrata API. - + """Retrieve the open data category via Socrata API. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - + limit = 100000 + offset = 0 + all_results = [] + while True: + results = client.get(self.resource_id, limit=limit, offset=offset) + if not results: + break + all_results.extend(results) + offset += limit # Example authenticated client (needed for non-public datasets): # client = Socrata(data.cdc.gov, # MyAppToken, @@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext): # First 2000 results, returned as JSON from API / converted to Python list of # dictionaries by sodapy. - results = client.get(self.resource_id, limit=100000) - # Convert to pandas DataFrame - results_df = pd.DataFrame.from_records(results) + results_df = pd.DataFrame.from_records(all_results) return knext.Table.from_pandas(results_df) From da4ed57a32efe7c17ecd7be9cb77835871353a1c Mon Sep 17 00:00:00 2001 From: UrbanGISer Date: Tue, 14 Nov 2023 17:30:41 -0500 Subject: [PATCH 08/11] Add Socrata Data Nodes --- knime_extension/src/nodes/opendata.py | 66 ++++++--------------------- 1 file changed, 13 insertions(+), 53 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index 102fd90a..e09b7dc9 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -892,26 +892,11 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", + description="Socrata dataset based on search keywords", ) class SocrataSearchNode: - """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. - - US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, - injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, - epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. - - Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, - healthcare, transportation, and the environment. - - Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. - - NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. - - UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. - - World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. - + """Retrive the open data category via Socrata API. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -932,12 +917,10 @@ def execute(self, exec_context: knext.ExecutionContext): import pandas as pd import json from pandas import json_normalize - from urllib.parse import quote query_item = self.queryitem - encoded_query_item = quote(query_item) request = Request( - f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" + f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" ) response = urlopen(request) @@ -951,11 +934,9 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) - # Check if columns exist before dropping them - columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] - columns_to_drop = [col for col in columns_to_drop if col in df.columns] - df = df.drop(columns=columns_to_drop) - + df = df.drop( + columns=["classification.domain_tags", "classification.domain_metadata"] + ) # Find List list_columns = [ @@ -1005,23 +986,8 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset based on search keywords", ) class SocrataDataNode: - """Retrieve the open data category via Socrata API. - - US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, - injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, - epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. - - Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, - healthcare, transportation, and the environment. - - Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. - - NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. - - UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. - - World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. - + """Retrive the open data category via Socrata API. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -1054,15 +1020,7 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - limit = 100000 - offset = 0 - all_results = [] - while True: - results = client.get(self.resource_id, limit=limit, offset=offset) - if not results: - break - all_results.extend(results) - offset += limit + # Example authenticated client (needed for non-public datasets): # client = Socrata(data.cdc.gov, # MyAppToken, @@ -1071,7 +1029,9 @@ def execute(self, exec_context: knext.ExecutionContext): # First 2000 results, returned as JSON from API / converted to Python list of # dictionaries by sodapy. + results = client.get(self.resource_id, limit=100000) + # Convert to pandas DataFrame - results_df = pd.DataFrame.from_records(all_results) + results_df = pd.DataFrame.from_records(results) return knext.Table.from_pandas(results_df) From fd34763b23431c7ea561f06f8e2d3b8c4435a0e7 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Tue, 5 Mar 2024 08:05:02 -0500 Subject: [PATCH 09/11] revise socrata --- knime_extension/src/nodes/opendata.py | 66 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index e09b7dc9..102fd90a 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -892,11 +892,26 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset based on search keywords", + description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", ) class SocrataSearchNode: - """Retrive the open data category via Socrata API. - + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -917,10 +932,12 @@ def execute(self, exec_context: knext.ExecutionContext): import pandas as pd import json from pandas import json_normalize + from urllib.parse import quote query_item = self.queryitem + encoded_query_item = quote(query_item) request = Request( - f"http://api.us.socrata.com/api/catalog/v1?q={query_item}&only=datasets&limit=10000" + f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) response = urlopen(request) @@ -934,9 +951,11 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) - df = df.drop( - columns=["classification.domain_tags", "classification.domain_metadata"] - ) + # Check if columns exist before dropping them + columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] + columns_to_drop = [col for col in columns_to_drop if col in df.columns] + df = df.drop(columns=columns_to_drop) + # Find List list_columns = [ @@ -986,8 +1005,23 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset based on search keywords", ) class SocrataDataNode: - """Retrive the open data category via Socrata API. - + """Retrieve the open data category via Socrata API. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. + + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + healthcare, transportation, and the environment. + + Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. + + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. + + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -1020,7 +1054,15 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - + limit = 100000 + offset = 0 + all_results = [] + while True: + results = client.get(self.resource_id, limit=limit, offset=offset) + if not results: + break + all_results.extend(results) + offset += limit # Example authenticated client (needed for non-public datasets): # client = Socrata(data.cdc.gov, # MyAppToken, @@ -1029,9 +1071,7 @@ def execute(self, exec_context: knext.ExecutionContext): # First 2000 results, returned as JSON from API / converted to Python list of # dictionaries by sodapy. - results = client.get(self.resource_id, limit=100000) - # Convert to pandas DataFrame - results_df = pd.DataFrame.from_records(results) + results_df = pd.DataFrame.from_records(all_results) return knext.Table.from_pandas(results_df) From df68fa27f5a0f17e62df93ca191462c1f1c10809 Mon Sep 17 00:00:00 2001 From: Lingbo Liu Date: Thu, 6 Feb 2025 14:24:32 -0500 Subject: [PATCH 10/11] add time out parameters --- knime_extension/src/nodes/opendata.py | 63 +++++++++++++++++---------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index 102fd90a..70c16c54 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -895,23 +895,23 @@ def execute(self, exec_context: knext.ExecutionContext): description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", ) class SocrataSearchNode: - """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. - - US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, - injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. - Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, healthcare, transportation, and the environment. Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. - + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. - + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. - + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. - + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list. """ @@ -923,6 +923,14 @@ class SocrataSearchNode: default_value="Massachusetts", ) + timeout = knext.IntParameter( + label="Request timeout in seconds", + description="The timeout in seconds for the request API.", + default_value=120, + min_value=1, + is_advanced=True, + ) + def configure(self, configure_context): # TODO Create combined schema return None @@ -940,7 +948,7 @@ def execute(self, exec_context: knext.ExecutionContext): f"http://api.us.socrata.com/api/catalog/v1?q={encoded_query_item}&only=datasets&limit=10000" ) - response = urlopen(request) + response = urlopen(request, timeout=self.timeout) response_body = response.read() # Load the JSON response into a Python dictionary @@ -952,11 +960,13 @@ def execute(self, exec_context: knext.ExecutionContext): # Create a DataFrame from the dataset information, and flatten the nested dictionaries df = json_normalize(dataset_info) # Check if columns exist before dropping them - columns_to_drop = ["classification.domain_tags", "classification.domain_metadata"] + columns_to_drop = [ + "classification.domain_tags", + "classification.domain_metadata", + ] columns_to_drop = [col for col in columns_to_drop if col in df.columns] df = df.drop(columns=columns_to_drop) - # Find List list_columns = [ col for col in df.columns if any(isinstance(item, list) for item in df[col]) @@ -1007,21 +1017,21 @@ def execute(self, exec_context: knext.ExecutionContext): class SocrataDataNode: """Retrieve the open data category via Socrata API. - US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, - injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, + US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, + injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, epidemiological studies, and collaborative research efforts conducted by the CDC and its partners. - Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, + Data.gov: The official open data platform of the United States government, offering datasets from various U.S. government agencies covering fields such as education, healthcare, transportation, and the environment. Chicago Data Portal: The open data platform provided by the City of Chicago, offering datasets related to the city, including crime data, transportation data, demographic statistics, and more. - + NYC Open Data: The open data platform provided by the City of New York, offering datasets covering urban planning, public transportation, health, and various other aspects of the city. - + UK Government Data Service: The open data platform provided by the UK government, offering datasets from various governmental bodies covering economics, social issues, the environment, and more. - - World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. - + + World Bank Data: The open data platform provided by the World Bank, offering a wide range of economic, social, and environmental datasets from around the world for research and analysis of global development trends. + The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits,and NGOs.. This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node. @@ -1041,7 +1051,15 @@ class SocrataDataNode: default_value="", ) - def configure(self, configure_context): + timeout = knext.IntParameter( + label="Request timeout in seconds", + description="The timeout in seconds for the request API.", + default_value=120, + min_value=1, + is_advanced=True, + ) + + def configure(self, configure_context, input_schema_1): # TODO Create combined schema return None @@ -1054,7 +1072,8 @@ def execute(self, exec_context: knext.ExecutionContext): # Unauthenticated client only works with public data sets. Note 'None' # in place of application token, and no username or password: client = Socrata(self.metadata_domain, None) - limit = 100000 + client.timeout = self.timeout + limit = 100000 offset = 0 all_results = [] while True: From 1049e56cc0d638e661eb0004656512cea60815d0 Mon Sep 17 00:00:00 2001 From: Tobias Koetter Date: Fri, 28 Mar 2025 17:10:43 +0100 Subject: [PATCH 11/11] Fix bug in configure of Socrata Data Query node --- knime_extension/src/nodes/opendata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/knime_extension/src/nodes/opendata.py b/knime_extension/src/nodes/opendata.py index 70c16c54..fe2de0ce 100644 --- a/knime_extension/src/nodes/opendata.py +++ b/knime_extension/src/nodes/opendata.py @@ -892,10 +892,10 @@ def execute(self, exec_context: knext.ExecutionContext): ) @knext.output_table( name="Socrata dataset list", - description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term. ", + description="Socrata dataset list from a wealth of open data resources from governments, non-profits, and NGOs around the world based on the query term.", ) class SocrataSearchNode: - """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. + """Access open datasets from various well-known data resources and organizations effortlessly using the SODA interface. US Centers for Disease Control and Prevention (CDC): CDC data includes information on infectious diseases, chronic conditions, environmental health hazards, injury prevention, maternal and child health, immunization coverage, and much more. These datasets are collected through surveillance systems, population surveys, @@ -1059,7 +1059,7 @@ class SocrataDataNode: is_advanced=True, ) - def configure(self, configure_context, input_schema_1): + def configure(self, configure_context): # TODO Create combined schema return None