diff --git a/README.md b/README.md
index fb5051c..82cc98f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,85 @@
-# pyopendataes
-The aim of this project is to create a Python library to access data from datos.gob.es, inspired in the R library [opendataes](https://github.com/ropenspain/opendataes/).
-The library is under construction. Feel free to collaborate.
+# datosgobes
+## Introduction
+datosgobes is a Python library that provides a simple interface to access the open data API of [https://datos.gob.es/es/](https://datos.gob.es/es/). The package is designed to make it easy for users to retrieve and analyze data from the API.
+
+It is inspired by the R library [rOpenSpain/opendataes](https://github.com/rOpenSpain/opendataes).
+
+## Installation
+
+You can install datosgobes using pip:
+
+```bash
+pip install datosgobes
+```
+
+
+## Usage
+Once installed, you can start using datosgobes by importing it in your Python script:
+
+```python
+import datosgobes
+```
+
+First, you need to initialize the manager:
+
+```python
+manager = datosgobes.Manager()
+```
+
+Then, you can use the manager to search for datasets:
+
+```python
+datasets = manager.search_datasets('sanidad')
+```
+
+You can also retrieve a dataset by its identifier:
+
+```python
+dataset = manager.get_dataset('l01080193-resultados-absolutos-de-las-elecciones-al-parlamento-europeo-de-la-ciudad-de-barcelona')
+```
+
+Once you have a dataset, you can retrieve its metadata, title, description...:
+
+```python
+print(dataset.title)
+print(dataset.description)
+```
+
+Each dataset can contain multiple distributions.
They are stored in a list called `distributions`:
+
+```python
+print(dataset.distributions)
+```
+
+You can retrieve a distribution by its list index:
+
+```python
+distribution = dataset.distributions[0]
+```
+
+The distribution is a class that contains information about the data, such as its title, description, format...:
+
+```python
+print(distribution.title)
+print(distribution.description)
+print(distribution.format)
+```
+Finally, you can retrieve the data from the distribution:
+
+```python
+data = distribution.data
+```
+
+The data is returned as a pandas DataFrame, if the format is recognized, or as raw bytes otherwise.
+You can use it as you wish.
+
+## Collaboration
+
+If you want to contribute to the project, you can fork the repository and submit a pull request. Please make sure to follow the coding standards and documentation guidelines.
+
+## License
+
+datosgobes is released under the MIT license.
diff --git a/datosgobes/__init__.py b/datosgobes/__init__.py
new file mode 100644
index 0000000..1c1738e
--- /dev/null
+++ b/datosgobes/__init__.py
@@ -0,0 +1 @@
+from .manager import Manager
diff --git a/datosgobes/data_download.py b/datosgobes/data_download.py
new file mode 100644
index 0000000..abdf24a
--- /dev/null
+++ b/datosgobes/data_download.py
@@ -0,0 +1,58 @@
+import pandas as pd
+import requests
+from io import BytesIO
+
+def download_data(url, output_file=None):
+
+    """
+    Downloads the data from the provided URL and optionally saves it to a file or attempts to load it as a pandas DataFrame.
+
+    Args:
+        url (str): The URL of the data to download.
+        output_file (str, optional): Path to the file where downloaded data should be saved.
+            Defaults to None.
+
+    Returns:
+        pandas.DataFrame | bytes | None: The loaded DataFrame on success, raw bytes on unknown format, or None on errors.
+ """ + try: + response = requests.get(url, allow_redirects=True, stream=True) + + if response.status_code == 200: + # Check for content-type header + content_type = response.headers.get('Content-Type') + + if output_file: + with open(output_file, 'wb') as f: + for chunk in response.iter_content(1024): + f.write(chunk) + return None # Indicate successful download to file + + else: + # Try loading data as pandas DataFrame based on content type and potential extensions + try: + if content_type: + if content_type.startswith('text/csv'): + encodings = ['utf-8', 'latin-1', 'cp1252', 'utf-16'] # Common encodings to try + for encoding in encodings: + try: + return pd.read_csv(url, engine='python', encoding=encoding) + except: + pass # Ignore parsing errors and try next encoding + elif content_type.startswith('application/json'): + # Likely JSON format + return pd.read_json(response.content) + elif content_type.startswith('application/vnd.ms-excel'): + # Likely Excel format, attempt loading with xlrd + return pd.read_excel(url, engine='xlrd') + elif content_type.startswith('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'): + # Likely Excel format, attempt loading with openpyxl + return pd.read_excel(url, engine='openpyxl') + else: + # Unknown format, return raw bytes + return response.content + + except: + print("Failed to parse data as pandas DataFrame.") + except: + print("Failed to download data from URL.") diff --git a/datosgobes/manager.py b/datosgobes/manager.py new file mode 100644 index 0000000..06ee301 --- /dev/null +++ b/datosgobes/manager.py @@ -0,0 +1,98 @@ +import requests +import pandas as pd +from .opendataset import OpenDataSet + +class Manager: + def __init__(self) -> None: + self.url = "http://datos.gob.es/apidata" + self._search_result = None + + def _list_datasets(self, start_page=0, pages_limit=1): + """Get the collection of datasets from the portal. The collection is huge, so be sure to limit the number of pages to download. 
+ + Args: + start_page (int, optional): Page to start the download from. Defaults to 0. + pages_limit (int, optional): Limit of pages to download. Defaults to 1. + + Returns: + list of the metadata of the datasets. + """ + start_url = f'{self.url}/catalog/dataset.json?_page={start_page}' + all_datasets = [] + i = start_page + while start_url and i list: + if not self._search_result: + self._search_result = self._list_datasets(start_page, pages_limit) + + datasets = [] + for dataset in self._search_result: + opendataset = self._create_dataset(dataset) + datasets.append(opendataset) + return datasets + + def search_datasets(self, query: str, start_page=0, pages_limit=1) -> list: + self._search_result = self._query_datasets(query, start_page, pages_limit) + datasets = [] + for dataset in self._search_result: + opendataset = self._create_dataset(dataset) + datasets.append(opendataset) + return datasets + + def _create_dataset(self, dataset_meta: dict) -> OpenDataSet: + dataset = OpenDataSet(url=dataset_meta["_about"]) + return dataset + + def get_dataset(self, id: str) -> OpenDataSet: + dataset = OpenDataSet(url=f'{self.url}/catalog/dataset/{id}') + return dataset \ No newline at end of file diff --git a/datosgobes/opendataset.py b/datosgobes/opendataset.py new file mode 100644 index 0000000..4c96de1 --- /dev/null +++ b/datosgobes/opendataset.py @@ -0,0 +1,118 @@ +import pandas as pd +import numpy as np +import requests +from .data_download import download_data + + +class OpenDataSet: + def __init__(self, url:str): + self.url = url + self.id = url.split('/')[-1] + # self.metadata = {} + + def __repr__(self): + return f"OpenDataSet('{self.id}')" + + @property + def metadata(self): + # Fetch metadata from the API + response = requests.get(f"http://datos.gob.es/apidata/catalog/dataset/{self.id}") + return response.json()["result"]["items"][0] + + + @property + def publisher_data_url(self): + return self.metadata.get('identifier') + + @property + def title(self): + # 
Extract all title values based on language code + titles = {d['_lang']: d['_value'] for d in self.metadata.get('title', [])} + return titles + + @property + def description(self): + # Extract the first description value for each language + descriptions = {d['_lang']: d['_value'] for d in self.metadata.get('description', [])} + return descriptions + + @property + def keywords(self): + # Group keywords by language and remove duplicates + keywords = {} + for entry in self.metadata.get('keyword', []): + lang_code = entry['_lang'] + keyword = entry['_value'] + keywords.setdefault(lang_code, set()).add(keyword) + return {lang: list(values) for lang, values in keywords.items()} + + @property + def distributions(self): + """ + Returns a list of Distribution objects, each containing information for a single distribution. + """ + distributions = self.metadata.get('distribution', []) + distribution_objects = [] + for distribution_data in distributions: + distribution_objects.append(Distribution(distribution_data)) + return distribution_objects + + def get_distribution_by_format(self, format): + """ + Returns the distribution information for a specific format (e.g., text/csv). 
+ """ + matched_distributions = [] + for distribution in self.distributions: + if distribution.format == format: + matched_distributions.append(distribution) + return matched_distributions + + +class Distribution: + def __init__(self, metadata): + self.metadata = metadata + + def __repr__(self): + return f"Distribution(accessURL={self.access_url}, format={self.format}, byte_size={self.byte_size}, titles={self.titles})" + + + @property + def access_url(self): + return self.metadata.get('accessURL') + + @property + def byte_size(self): + return self.metadata.get('byteSize') + + @property + def format(self): + return self.metadata.get('format')['value'] + + @property + def titles(self): + # Extract title information for all languages efficiently + return {d['_lang']: d['_value'] for d in self.metadata.get('title', [])} + + def download_data(self, output_file=None): + """ + Downloads the data from the distribution URL and optionally saves it to a file or attempts to load it as a pandas DataFrame. + + This method utilizes the download_data function from the data_download module. + + Args: + output_file (str, optional): Path to the file where downloaded data should be saved. + Defaults to None. + + Returns: + pandas.DataFrame | bytes | None: The loaded DataFrame on success, raw bytes on unknown format, or None on errors. + """ + return download_data(self.access_url, output_file) + + @property + def data(self): + """ + Returns the downloaded data as a pandas DataFrame, raw bytes, or None if not downloaded yet. 
+ """ + if self.access_url: + return self.download_data() # Download data if not already downloaded + return None \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f3834bd --- /dev/null +++ b/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup, find_packages + +setup( + name="datosgobes", + description="Python package to access Spanish Government Open Data from the datos.gob.es API", + version="0.1.0", + author="Juan Valero", + author_email="olietvalero@gmail.com", + url="https://github.com/jvaleroliet/datosgobes", + license="MIT", + packages=find_packages(), + install_requires=[ + "pandas>=2.2.0", + "requests>=2.31.0", + "numpy==1.26.4" + ], + python_requires=">=3.9", +)