V0.1 #1

Merged
merged 24 commits on Jul 16, 2024
85 changes: 82 additions & 3 deletions README.md
@@ -1,6 +1,85 @@
# pyopendataes
The aim of this project is to create a Python library to access data from datos.gob.es, inspired by the R library [opendataes](https://github.com/ropenspain/opendataes/).

The library is under construction. Feel free to collaborate.
# datosgobes

## Introduction
datosgobes is a Python library that provides a simple interface to the open data API of [https://datos.gob.es/es/](https://datos.gob.es/es/). The package is designed to make it easy to retrieve and analyze data from the API.

It is inspired by the R library [rOpenSpain/opendataes](https://github.com/rOpenSpain/opendataes).

## Installation

You can install datosgobes using pip:

```bash
pip install datosgobes
```


## Usage
Once installed, you can start using datosgobes by importing it into your Python script:

```python
import datosgobes
```

First, you need to initialize the manager:

```python
manager = datosgobes.Manager()
```

Then, you can use the manager to search for datasets:

```python
datasets = manager.search_datasets('sanidad')
```
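
The result is a list of dataset objects; for a quick look at what came back, you can print them, which shows each dataset identifier:

```python
# Print the identifier of every dataset returned by the search
for ds in datasets:
    print(ds)
```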

You can also retrieve a dataset by its identifier:

```python
dataset = manager.get_dataset('l01080193-resultados-absolutos-de-las-elecciones-al-parlamento-europeo-de-la-ciudad-de-barcelona')
```

Once you have a dataset, you can retrieve its metadata, such as its title and description:

```python
print(dataset.title)
print(dataset.description)
```
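
Note that titles and descriptions are returned as dictionaries keyed by language code (for example `es` or `en`), so you can pick the language you need:

```python
# Get the Spanish title, if available
print(dataset.title.get('es'))
```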

Each dataset can contain multiple distributions. They are stored in a list called `distributions`:

```python
print(dataset.distributions)
```

You can retrieve a distribution by its list index:

```python
distribution = dataset.distributions[0]
```

Each distribution is an object that describes a single downloadable resource, with attributes such as its titles, access URL, and format:

```python
print(distribution.titles)
print(distribution.access_url)
print(distribution.format)
```
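
You can also filter a dataset's distributions by format with `get_distribution_by_format`, which returns the matching distributions as a list:

```python
# Get all CSV distributions of the dataset
csv_distributions = dataset.get_distribution_by_format('text/csv')
```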
Finally, you can retrieve the data from the distribution:

```python
data = distribution.data
```

The data is returned as a pandas DataFrame if the format is recognized, or as raw bytes otherwise.
You can use it however you wish.
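
Since the return type depends on the source format, a small check keeps your code robust. A minimal example:

```python
import pandas as pd

if isinstance(data, pd.DataFrame):
    print(data.head())   # tabular data, ready for analysis
elif data is not None:
    print(data[:100])    # raw bytes for unrecognized formats
```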

## Collaboration

If you want to contribute to the project, you can fork the repository and submit a pull request. Please make sure to follow the coding standards and documentation guidelines.

## License

datosgobes is released under the MIT license.

1 change: 1 addition & 0 deletions datosgobes/__init__.py
@@ -0,0 +1 @@
from .manager import Manager
58 changes: 58 additions & 0 deletions datosgobes/data_download.py
@@ -0,0 +1,58 @@
import pandas as pd
import requests
from io import BytesIO

def download_data(url, output_file=None):

"""
Downloads the data from the provided URL and optionally saves it to a file or attempts to load it as a pandas DataFrame.

Args:
url (str): The URL of the data to download.
output_file (str, optional): Path to the file where downloaded data should be saved.
Defaults to None.

Returns:
pandas.DataFrame | bytes | None: The loaded DataFrame on success, raw bytes on unknown format, or None on errors.
"""
try:
response = requests.get(url, allow_redirects=True, stream=True)

if response.status_code == 200:
# Check for content-type header
content_type = response.headers.get('Content-Type')

if output_file:
with open(output_file, 'wb') as f:
for chunk in response.iter_content(1024):
f.write(chunk)
return None # Indicate successful download to file

else:
# Try loading data as pandas DataFrame based on content type and potential extensions
try:
if content_type:
if content_type.startswith('text/csv'):
encodings = ['utf-8', 'latin-1', 'cp1252', 'utf-16'] # Common encodings to try
for encoding in encodings:
try:
return pd.read_csv(url, engine='python', encoding=encoding)
                                except Exception:
                                    pass  # Ignore parsing errors and try the next encoding
elif content_type.startswith('application/json'):
                            # Likely JSON format; wrap the bytes in a file-like object so pandas can parse them
                            return pd.read_json(BytesIO(response.content))
elif content_type.startswith('application/vnd.ms-excel'):
# Likely Excel format, attempt loading with xlrd
return pd.read_excel(url, engine='xlrd')
elif content_type.startswith('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'):
# Likely Excel format, attempt loading with openpyxl
return pd.read_excel(url, engine='openpyxl')
else:
# Unknown format, return raw bytes
return response.content

                except Exception:
                    print("Failed to parse data as pandas DataFrame.")
    except Exception:
        print("Failed to download data from URL.")
98 changes: 98 additions & 0 deletions datosgobes/manager.py
@@ -0,0 +1,98 @@
import requests
import pandas as pd
from .opendataset import OpenDataSet

class Manager:
def __init__(self) -> None:
self.url = "http://datos.gob.es/apidata"
self._search_result = None

def _list_datasets(self, start_page=0, pages_limit=1):
"""Get the collection of datasets from the portal. The collection is huge, so be sure to limit the number of pages to download.

Args:
start_page (int, optional): Page to start the download from. Defaults to 0.
pages_limit (int, optional): Limit of pages to download. Defaults to 1.

Returns:
list of the metadata of the datasets.
"""
start_url = f'{self.url}/catalog/dataset.json?_page={start_page}'
all_datasets = []
i = start_page
        while start_url and (pages_limit is None or i < start_page + pages_limit):
# Download the page
response = requests.get(start_url)
data = response.json()

# Extract and normalize the datasets
page_datasets = data["result"]["items"]

# Add the datasets to the collection
all_datasets += page_datasets

# Obtain the URL for the next page
next_page_url = data["result"]["next"]
start_url = next_page_url if next_page_url else None
            if pages_limit is not None:
                i += 1
return all_datasets

def _query_datasets(self, query: str, start_page=0, pages_limit=1):
"""Get the collection of datasets from the portal based on a string query. The collection is huge, so be sure to limit the number of pages to download.

Args:
query (str): String to search in the datasets.
start_page (int, optional): Page to start the download from. Defaults to 0.
pages_limit (int, optional): Limit of pages to download. Defaults to 1.

Returns:
list of the metadata of the datasets.
"""
start_url = f'{self.url}/catalog/dataset/title/{query}.json?_page={start_page}'
all_datasets = []
i = start_page
        while start_url and (pages_limit is None or i < start_page + pages_limit):
# Download the page
response = requests.get(start_url)
data = response.json()

# Extract and normalize the datasets
page_datasets = data["result"]["items"]

# Add the datasets to the collection
all_datasets += page_datasets

# Obtain the URL for the next page
next_page_url = data["result"]["next"]
start_url = next_page_url if next_page_url else None
            if pages_limit is not None:
                i += 1
return all_datasets


def get_datasets(self, start_page=0, pages_limit=1) -> list:
if not self._search_result:
self._search_result = self._list_datasets(start_page, pages_limit)

datasets = []
for dataset in self._search_result:
opendataset = self._create_dataset(dataset)
datasets.append(opendataset)
return datasets

def search_datasets(self, query: str, start_page=0, pages_limit=1) -> list:
self._search_result = self._query_datasets(query, start_page, pages_limit)
datasets = []
for dataset in self._search_result:
opendataset = self._create_dataset(dataset)
datasets.append(opendataset)
return datasets

def _create_dataset(self, dataset_meta: dict) -> OpenDataSet:
dataset = OpenDataSet(url=dataset_meta["_about"])
return dataset

def get_dataset(self, id: str) -> OpenDataSet:
dataset = OpenDataSet(url=f'{self.url}/catalog/dataset/{id}')
return dataset
118 changes: 118 additions & 0 deletions datosgobes/opendataset.py
@@ -0,0 +1,118 @@
import requests
from .data_download import download_data


class OpenDataSet:
def __init__(self, url:str):
self.url = url
self.id = url.split('/')[-1]
# self.metadata = {}

def __repr__(self):
return f"OpenDataSet('{self.id}')"

@property
def metadata(self):
# Fetch metadata from the API
response = requests.get(f"http://datos.gob.es/apidata/catalog/dataset/{self.id}")
return response.json()["result"]["items"][0]


@property
def publisher_data_url(self):
return self.metadata.get('identifier')

@property
def title(self):
# Extract all title values based on language code
titles = {d['_lang']: d['_value'] for d in self.metadata.get('title', [])}
return titles

@property
def description(self):
        # Extract the description value for each language code
descriptions = {d['_lang']: d['_value'] for d in self.metadata.get('description', [])}
return descriptions

@property
def keywords(self):
# Group keywords by language and remove duplicates
keywords = {}
for entry in self.metadata.get('keyword', []):
lang_code = entry['_lang']
keyword = entry['_value']
keywords.setdefault(lang_code, set()).add(keyword)
return {lang: list(values) for lang, values in keywords.items()}

@property
def distributions(self):
"""
Returns a list of Distribution objects, each containing information for a single distribution.
"""
distributions = self.metadata.get('distribution', [])
distribution_objects = []
for distribution_data in distributions:
distribution_objects.append(Distribution(distribution_data))
return distribution_objects

def get_distribution_by_format(self, format):
"""
Returns the distribution information for a specific format (e.g., text/csv).
"""
matched_distributions = []
for distribution in self.distributions:
if distribution.format == format:
matched_distributions.append(distribution)
return matched_distributions


class Distribution:
def __init__(self, metadata):
self.metadata = metadata

def __repr__(self):
return f"Distribution(accessURL={self.access_url}, format={self.format}, byte_size={self.byte_size}, titles={self.titles})"


@property
def access_url(self):
return self.metadata.get('accessURL')

@property
def byte_size(self):
return self.metadata.get('byteSize')

@property
def format(self):
        # Guard against distributions without a declared format
        fmt = self.metadata.get('format') or {}
        return fmt.get('value')

@property
def titles(self):
# Extract title information for all languages efficiently
return {d['_lang']: d['_value'] for d in self.metadata.get('title', [])}

def download_data(self, output_file=None):
"""
Downloads the data from the distribution URL and optionally saves it to a file or attempts to load it as a pandas DataFrame.

This method utilizes the download_data function from the data_download module.

Args:
output_file (str, optional): Path to the file where downloaded data should be saved.
Defaults to None.

Returns:
pandas.DataFrame | bytes | None: The loaded DataFrame on success, raw bytes on unknown format, or None on errors.
"""
return download_data(self.access_url, output_file)

@property
def data(self):
"""
Returns the downloaded data as a pandas DataFrame, raw bytes, or None if not downloaded yet.
"""
if self.access_url:
            return self.download_data()  # Note: the data is fetched on every access; it is not cached
return None
18 changes: 18 additions & 0 deletions setup.py
@@ -0,0 +1,18 @@
from setuptools import setup, find_packages

setup(
name="datosgobes",
description="Python package to access Spanish Government Open Data from the datos.gob.es API",
version="0.1.0",
author="Juan Valero",
author_email="olietvalero@gmail.com",
url="https://github.com/jvaleroliet/datosgobes",
license="MIT",
packages=find_packages(),
install_requires=[
"pandas>=2.2.0",
"requests>=2.31.0",
"numpy==1.26.4"
],
python_requires=">=3.9",
)