Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Tabor for DBBackups #77

Merged
merged 3 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

3 changes: 3 additions & 0 deletions Dockerfiles/Dockerfile.geocml-desktop
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r
######### Customize Container Here ###########
######### End Customizations ###########

# Uninstall Ansible stuff
RUN rm -rf $HOME/.ansible && apt purge -y ansible*

# Remove install cache
RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/

Expand Down
32 changes: 24 additions & 8 deletions Dockerfiles/Dockerfile.geocml-task-scheduler
Original file line number Diff line number Diff line change
@@ -1,21 +1,37 @@
FROM ubuntu:22.04

USER root
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update -y
RUN apt install -y software-properties-common
RUN add-apt-repository ppa:deadsnakes/ppa && apt update -y

# Install Python and Pip
RUN apt install -y python3-pip && pip install psycopg2-binary
# Install Python3.12 and Pip
RUN apt install -y python3.12 python3-pip

# Install Ansible dependencies
RUN apt install -y git python3.12-venv python3.12-dev

# Install psycopg2-binary
RUN pip install psycopg2-binary
RUN pip3 install psycopg2-binary

# Install Ansible
RUN apt install -y ansible

# Create task_log file
RUN touch /task_log
# Copy gTS build resources to the container
COPY ./build-resources/geocml-task-scheduler/ /geocml-task-scheduler

# Copy gTS to the container
COPY ./build-resources/geocml-task-scheduler/geocml-task-scheduler/ /geocml-task-scheduler
# Install Ansible dependencies and run through playbook
COPY ./ansible-playbooks/geocml-task-scheduler-requirements.yaml ./ansible-playbooks/geocml-task-scheduler-playbook.yaml ./
RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r geocml-task-scheduler-requirements.yaml && ansible-playbook -i,localhost geocml-task-scheduler-playbook.yaml --tags "all" && rm -f ./*.yaml

######### Customize Container Here ###########
######### End Customizations ###########

CMD python3 /geocml-task-scheduler/schedule.py
# Uninstall Ansible stuff
RUN rm -rf $HOME/.ansible && apt purge -y ansible* && apt purge -y git* && apt purge -y virtualenv*

# Remove install cache
RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/

CMD python3 /geocml-task-scheduler/geocml-task-scheduler/schedule.py
34 changes: 34 additions & 0 deletions ansible-playbooks/geocml-task-scheduler-playbook.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
- # Configure geocml-task-scheduler
hosts: localhost
connection: local
gather_facts: yes
become: yes
tasks:
- name: Create task_log file
ansible.builtin.command: touch /task_log
- name: Clone Tabor source
ansible.builtin.command:
chdir: /geocml-task-scheduler
cmd: git clone https://github.com/geoCML/tabor.git
- name: Update Tabor submodule to latest release tag
shell: |
cd /geocml-task-scheduler/tabor
git fetch --tags
TAG=$(git tag | tail -1)
git checkout $TAG
- name: Create python venv for Tabor build
ansible.builtin.command:
chdir: /geocml-task-scheduler/tabor
cmd: python3.12 -m venv ./venv
- name: Install Tabor requirements
ansible.builtin.pip:
chdir: /geocml-task-scheduler/tabor
virtualenv: ./venv
requirements: ./requirements.txt
- name: Build Tabor
shell: |
cd /geocml-task-scheduler/tabor
source ./venv/bin/activate && python3.12 -m pip install -U pyinstaller==6.9.0 && pyinstaller --paths=./src -y ./src/tabor.py
args:
executable: /bin/bash
12 changes: 12 additions & 0 deletions ansible-playbooks/geocml-task-scheduler-requirements.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Add Ansible Galaxy Packages here, role_example_hello included to prevent errors in template testing

# NOTE: Uncomment the two lines below if you want to test your Ansible installation
# before proceeding with the rest of the playbook.
#
# This is typically only necessary if you are changing the default Ansible installation
# that comes with the geoCML base image.
#roles:
#- irixjp.role_example_hello # https://galaxy.ansible.com/irixjp/role_example_hello
collections:
- community.general
Original file line number Diff line number Diff line change
@@ -1,88 +1,67 @@
import psycopg2
import os
import subprocess
from time import time
from task_logger import log

ignore_tables = ('spatial_ref_sys', 'geometry_columns', 'geography_columns')
ignore_schemas = ('pg_catalog', 'information_schema')
ignore_tables = ("spatial_ref_sys", "geometry_columns", "geography_columns") # TODO: https://github.com/geoCML/tabor/issues/7
ignore_schemas = ("pg_catalog", "information_schema")

def backup_geocml_db():
try:
conn = psycopg2.connect(dbname='geocml_db',
user='postgres',
password='admin',
host='geocml-postgres',
conn = psycopg2.connect(dbname="geocml_db",
user="geocml",
password="geocml",
host="geocml-postgres",
port=5432)
except psycopg2.OperationalError:
log('Couldn\'t connect to geocml_db; is the postgresql service started?')
log("Couldn\'t connect to geocml_db; is the postgresql service started?")
return

cursor = conn.cursor()
cursor.execute('SELECT DISTINCT table_schema FROM information_schema.tables;')
schemas = cursor.fetchall()
back_up_timestamp = time()
path_to_backup_dir = os.path.join(os.sep, 'DBBackups', str(back_up_timestamp))
path_to_backup_dir = os.path.join(os.sep, "DBBackups", str(back_up_timestamp))
os.mkdir(path_to_backup_dir)
delete_backup_dir = True


# Write table schemata to .tabor file
out = subprocess.run(["tabor", "write", "--db", "geocml_db",
"--username", "postgres", "--password", "admin",
"--host", "geocml-postgres",
"--file", os.path.join(path_to_backup_dir, "geocml_db.tabor")],
capture_output=True)

if out.stderr:
log("Failed to generate .tabor file {}".format(out.stderr))
os.rmdir(path_to_backup_dir)
return

cursor = conn.cursor()
cursor.execute("""SELECT DISTINCT table_schema FROM information_schema.tables;""")
schemas = cursor.fetchall()

# Write table data to CSV file
for schema in schemas:
if schema[0] in ignore_schemas:
continue
cursor.execute('SELECT * FROM information_schema.tables WHERE table_schema = \'{}\';'
.format(schema[0]))
tables = cursor.fetchall()

cursor.execute(f"""SELECT * FROM information_schema.tables WHERE table_schema = '{schema[0]}';""")

tables = cursor.fetchall()

for table in tables:
if table[2] in ignore_tables:
continue

delete_backup_dir = False

# Write to schema file
schema_file_path = os.path.join(path_to_backup_dir, 'schema:{}.{}.sql'.format(schema[0], table[2]))
schema_file = open(schema_file_path, 'w')

if not schema[0] == 'public':
cursor.execute('SELECT DISTINCT grantee FROM information_schema.role_table_grants WHERE table_schema = \'{}\';'
.format(schema[0]))
schema_owner = cursor.fetchall()
schema_file.write('CREATE SCHEMA IF NOT EXISTS {} AUTHORIZATION {};\n'
.format(schema[0], schema_owner[0][0]))

cursor.execute('SELECT pg_get_constraintdef(oid) FROM pg_constraint WHERE contype = \'p\' AND conrelid::regclass::text LIKE \'%{}%\';'.format(table[2]))

pk = cursor.fetchall()

cursor.execute('SELECT column_name, udt_name FROM information_schema.columns WHERE table_name = \'{}\';'
.format(table[2]))

columns_and_datatypes = []
for row in cursor:
if len(row) == 3: # column has a constraint
columns_and_datatypes.append('{} {} {}'.format(row[0], row[1], row[2]))
else:
columns_and_datatypes.append('{} {}'.format(row[0], row[1]))
columns_and_datatypes = ', '.join(columns_and_datatypes)

if len(pk) > 0: # table has primary key (expected)
schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({}, {});\n'.format(schema[0], table[2], columns_and_datatypes, pk[0][0]))
else:
schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({});\n'.format(schema[0], table[2], columns_and_datatypes))

cursor.execute('SELECT tableowner FROM pg_tables WHERE tablename = \'{}\';'.format(table[2]))
table_owner = cursor.fetchall()

schema_file.write('ALTER TABLE {}."{}" OWNER TO {};'.format(schema[0], table[2], table_owner[0][0]))
schema_file.close()

# Write to data file
data_file_path = os.path.join(path_to_backup_dir, 'data:{}.{}.csv'.format(schema[0], table[2]))
data_file = open(data_file_path, 'w')
cursor.copy_expert('COPY {}."{}" TO STDOUT WITH (FORMAT csv, DELIMITER \',\', HEADER FALSE);'.format(schema[0], table[2]), data_file)
data_file_path = os.path.join(path_to_backup_dir, "data:{}.{}.csv".format(schema[0], table[2]))
data_file = open(data_file_path, "w")
cursor.copy_expert(f"""COPY {schema[0]}."{table[2]}" TO STDOUT WITH (FORMAT csv, DELIMITER ',', HEADER);""", data_file)
data_file.close()

if delete_backup_dir: # nothing to back up
path_to_backup_dir.rmdir()
log("Nothing to backup")
os.rmdir(path_to_backup_dir)

cursor.close()
conn.close()
Original file line number Diff line number Diff line change
@@ -1,55 +1,72 @@
import ast
from io import StringIO
import psycopg2
import os
import subprocess
from time import time
from task_logger import log

def restore_geocml_db_from_backups():
try:
conn = psycopg2.connect(dbname='geocml_db',
user='postgres',
password='admin',
host='geocml-postgres',
conn = psycopg2.connect(dbname="geocml_db",
user="postgres",
password="admin",
host="geocml-postgres",
port=5432)
except psycopg2.OperationalError:
log('Couldn\'t connect to geocml_db; is the postgresql service started?')
log("Couldn\'t connect to geocml_db; is the postgresql service started?")
return

db_backups_dir = os.path.join(os.sep, 'DBBackups')
db_backups_dir = os.path.join(os.sep, "DBBackups")
now = time()
delta = float('inf')
most_recent_backup = ''
delta = float("inf")
most_recent_backup = ""
for subdir in os.walk(db_backups_dir):
try:
subdir_timestamp = float(subdir[0].split('/')[-1])
subdir_timestamp = float(subdir[0].split("/")[-1])
if now - subdir_timestamp < delta:
delta = now - subdir_timestamp
most_recent_backup = subdir[0]
except ValueError:
if not subdir[0] == db_backups_dir:
log('Found something unexpected in backup directory, skipping over: {}'.format(subdir[0]))
log("Found something unexpected in backup directory, skipping over: {}".format(subdir[0]))

if most_recent_backup == '':
log('No recent backups found. Aborting restoration process.')
if most_recent_backup == "":
log("No recent backups found. Aborting restoration process.")
return 0

log('Restoring geocml_db from {}'.format(most_recent_backup))
cursor = conn.cursor()
for sql_schema_file in os.listdir(most_recent_backup): # rebuild table schema
if sql_schema_file.split(':')[0] == 'schema':
log('Found SQL schema file {}'.format(sql_schema_file))
cursor.execute(open('{}/{}'.format(most_recent_backup, sql_schema_file), 'r').read())
log("Restoring geocml_db from {}".format(most_recent_backup))

cursor = conn.cursor()

# Rebuild tables from .tabor file

out = subprocess.run(["tabor", "read", "--file", os.path.join(most_recent_backup, "geocml_db.tabor")],
capture_output=True)

if out.stderr:
log("Failed to read .tabor file {}".format(out.stderr))

psql_data = ast.literal_eval(out.stdout.decode())

for table, psql_queries in psql_data.items():
log("Restoring table: {}".format(table))
for _, value in psql_queries.items():
cursor.execute(value)

conn.commit() # commit schema changes to the database before loading data from the CSV
log("Tables restored!")

for csv_data_file in os.listdir(most_recent_backup): # load data from CSV backups
file_name_split = csv_data_file.split(':')

if file_name_split[0] == 'data':
log('Found CSV data file {}'.format(csv_data_file))
file_name_split = file_name_split[1].split('.')
data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), 'r')
cursor.copy_expert('COPY {}."{}" FROM STDIN DELIMITER \',\' CSV HEADER;'
.format(file_name_split[0], file_name_split[1]), data_file)
file_name_split = csv_data_file.split(":")

if file_name_split[0] == "data":
log("Found CSV data file {}".format(csv_data_file))
file_name_split = file_name_split[1].split(".")
data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), "r").readlines()
cursor.copy_from(StringIO("".join(data_file[1::])), f"{file_name_split[1]}", sep=",",
columns=tuple(data_file[0].replace("\n", "").split(",")))
log("Finished loading data!")

conn.commit()
cursor.close()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
restore_geocml_db_task.start()

while True:
pass # keep schedule.py process running in container
pass # keep schedule.py process running in container
3 changes: 2 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ services:
dockerfile: ./Dockerfiles/Dockerfile.geocml-task-scheduler
image: ghcr.io/geocml/geocml-base-deployment:task-scheduler
hostname: geocml-task-scheduler
environment:
- PATH=/geocml-task-scheduler/tabor/dist/tabor:$PATH
networks:
- geocml-network
volumes:
Expand Down Expand Up @@ -55,4 +57,3 @@ networks:
geocml-network:
external: true
driver: bridge
...
Loading