Skip to content

Commit

Permalink
Merge pull request #383 from ipdgroup/master
Browse files Browse the repository at this point in the history
Implementing incremental by files, safer version of incremental backup.
  • Loading branch information
josegonzalez authored Feb 1, 2025
2 parents 3a4aebb + 0f34ecb commit 095b712
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 4 deletions.
7 changes: 7 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ CLI Help output::
log level to use (default: info, possible levels:
debug, info, warning, error, critical)
-i, --incremental incremental backup
--incremental-by-files incremental backup using modified time of files
--starred include JSON output of starred repositories in backup
--all-starred include starred repositories in backup [*]
--watched include JSON output of watched repositories in backup
Expand Down Expand Up @@ -239,6 +240,12 @@ Using (``-i, --incremental``) will only request new data from the API **since th

This means any blocking errors on previous runs can cause a large amount of missing data in backups.

Using (``--incremental-by-files``) will request new data from the API **based on when the backup file was last modified on the filesystem**. Note that if you modify a backup file yourself, its newer timestamp may cause subsequent updates to be skipped.

This is still safer than the timestamp-file-based ``-i, --incremental`` option, since a failed run cannot leave a misleading "last run" marker.

Currently, only issues and pull requests are handled this way.

Known blocking errors
---------------------

Expand Down
29 changes: 25 additions & 4 deletions github_backup/github_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ def parse_args(args=None):
dest="incremental",
help="incremental backup",
)
parser.add_argument(
"--incremental-by-files",
action="store_true",
dest="incremental_by_files",
help="incremental backup based on modification date of files",
)
parser.add_argument(
"--starred",
action="store_true",
Expand Down Expand Up @@ -1114,16 +1120,24 @@ def backup_issues(args, repo_cwd, repository, repos_template):
comments_template = _issue_template + "/{0}/comments"
events_template = _issue_template + "/{0}/events"
for number, issue in list(issues.items()):
issue_file = "{0}/{1}.json".format(issue_cwd, number)
if args.incremental_by_files and os.path.isfile(issue_file):
modified = os.path.getmtime(issue_file)
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
if modified > issue["updated_at"]:
logger.info("Skipping issue {0} because it wasn't modified since last backup".format(number))
continue

if args.include_issue_comments or args.include_everything:
template = comments_template.format(number)
issues[number]["comment_data"] = retrieve_data(args, template)
if args.include_issue_events or args.include_everything:
template = events_template.format(number)
issues[number]["event_data"] = retrieve_data(args, template)

issue_file = "{0}/{1}.json".format(issue_cwd, number)
with codecs.open(issue_file, "w", encoding="utf-8") as f:
with codecs.open(issue_file + ".temp", "w", encoding="utf-8") as f:
json_dump(issue, f)
os.rename(issue_file + ".temp", issue_file) # Unlike json_dump, this is atomic


def backup_pulls(args, repo_cwd, repository, repos_template):
Expand Down Expand Up @@ -1176,6 +1190,13 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
comments_template = _pulls_template + "/{0}/comments"
commits_template = _pulls_template + "/{0}/commits"
for number, pull in list(pulls.items()):
pull_file = "{0}/{1}.json".format(pulls_cwd, number)
if args.incremental_by_files and os.path.isfile(pull_file):
modified = os.path.getmtime(pull_file)
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
if modified > pull["updated_at"]:
logger.info("Skipping pull request {0} because it wasn't modified since last backup".format(number))
continue
if args.include_pull_comments or args.include_everything:
template = comments_regular_template.format(number)
pulls[number]["comment_regular_data"] = retrieve_data(args, template)
Expand All @@ -1185,9 +1206,9 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
template = commits_template.format(number)
pulls[number]["commit_data"] = retrieve_data(args, template)

pull_file = "{0}/{1}.json".format(pulls_cwd, number)
with codecs.open(pull_file, "w", encoding="utf-8") as f:
with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f:
json_dump(pull, f)
os.rename(pull_file + ".temp", pull_file) # Unlike json_dump, this is atomic


def backup_milestones(args, repo_cwd, repository, repos_template):
Expand Down

0 comments on commit 095b712

Please # to comment.