"""Back up personal files to an external hard drive or flash drive.

By Brian Tomasik (https://briantomasik.com/). First published: 2019-11-17. Last update of any kind: 2020-11-14T19-45.

To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.

This script was built with Python 3.6. Hopefully it should work for later versions of Python 3 also.

For explanation of this script, see https://briantomasik.com/organizing-computer-files/#backup_script

When first testing this script, I ran it as follows:
cd ~/test_files/ && python3 ~/files/yebs/processed/scripts_ytmc/backup.py -t -l
This allowed for testing the script out on dummy data and saving to a dummy "external" drive that was actually just a folder on my computer. Almost all the testing could be done with this configuration. For the last bits of testing, I first dropped the `-l` parameter and tested it on dummy folders on a real external drive. Finally, to run the script for real on my actual data, I ran:
cd ~/files/ && python3 ~/files/yebs/processed/scripts_ytmc/backup.py
"""

#import pdb
import argparse
import os
import subprocess
from datetime import datetime
from pathlib import Path
from random import choice
from sys import exit
from time import sleep
from typing import List, Optional, Tuple

# Names exported by `from <module> import *`.
# `test_that_required_programs_exist` is left out on purpose: its `test_` prefix would make
# pytest-style runners collect it as a test case after a star-import.
__all__ = [
    "FORMAT_OPTIONS",
    "YEBS_DIR",
    "DEBS_DIR",
    "TEMP_DIR",
    "CHECKSUMS_FILENAME",
    "YES",
    "VERBOSE",
    "HOME_DIRS_THAT_CHANGE_FREQUENTLY",
    "print_if_verbose",
    "parse_input",
    "recursively_list_all_files_and_all_dirs",
    "this_file_or_dir_changes_frequently",
    "recursively_list_all_files_and_all_dirs_except_some_that_change_frequently",
    "count_files_and_dirs",
    "get_username",
    "doing_debs_backup",
    "get_user_confirmation",
    "check_that_your_computer_has_the_right_folder_structure",
    "list_of_files_and_folders_in_dir",
    "assert_file_doesnt_exist_yet",
    "assert_exists_and_is_dir",
    "get_paths_to_target_dirs",
    "create_checksums_file",
    "create_backup_on_external_drive",
    "create_intermediate_file",
    "create_final_file",
    "do_yebs_backup",
    "do_debs_backup",
    "check_if_differences_between_two_lists",
    "main",
]

# Don't change these:
FORMAT_OPTIONS = [".7z", ".tar.7z", ".tar.gz.gpg", ".tar.bz2.gpg", ".zip.gpg"]
YEBS_DIR = "yebs"
DEBS_DIR = "debs"
TEMP_DIR = "temp"
CHECKSUMS_FILENAME = "sha512-of-everything_made-just-before-backup.txt"
YES = ""  # Pressing ENTER without typing anything means "yes".

# You can change these:
VERBOSE = False
HOME_DIRS_THAT_CHANGE_FREQUENTLY = [".cache", ".local", ".mozilla"]  # You can expand this list over time as needed.


def _require(condition: object, message: str) -> None:
    """Raise `AssertionError(message)` unless `condition` is truthy.

    This replaces bare `assert` statements so that the script's safety checks still run
    when Python is started with `-O` (which strips `assert`). The exception type and
    message are the same as the `assert` statements it replaces.
    """
    if not condition:
        raise AssertionError(message)


def _remove_if_file(path: str) -> None:
    """Delete `path` only if it is a regular file; directories are never touched."""
    if os.path.isfile(path):
        os.remove(path)


def print_if_verbose(text_string: str) -> None:
    """Print `text_string` only when the module-level `VERBOSE` flag is set."""
    if VERBOSE:
        print(text_string)


def parse_input() -> argparse.Namespace:
    """Parse the command-line options (`-l`, `-t`, `-f`) and return them."""
    parser = argparse.ArgumentParser(description="Script to back up personal files on your computer to an external hard drive or flash drive.")
    parser.add_argument("-l", "--local_dest", action="store_true", help="Run with a dummy local destination folder rather than an actual external drive. Choosing this option requires that you've set up the right local folder structure yourself.")
    parser.add_argument("-t", "--use_test_folders", action="store_true", help="Run the script in a dummy folder called `test_files/` rather than the real `files/` folder. And when sending files to the external drive, send them to `test_files/` rather than the usual `files/`.")
    parser.add_argument("-f", "--format", choices=FORMAT_OPTIONS, help="Rather than choosing randomly, use the specified backup format.")
    return parser.parse_args()


def test_that_required_programs_exist() -> None:
    """Run a harmless command for each external program this script needs.

    Raises whatever `subprocess.run(..., check=True)` raises if a program is missing
    or exits with a non-zero status.
    """
    programs_to_test = [
        ['rsync', '--version'],
        ['shasum', '--version'],
        ['zip', '--help'],
        ['tar', '--version'],
        ['7z', 'i'],
        ['gpg', '--version'],
    ]
    for command in programs_to_test:
        subprocess.run(command, stdout=subprocess.PIPE, check=True)  # Capture stdout to avoid printing it out to the screen.


def recursively_list_all_files_and_all_dirs(path_object) -> Tuple[List[str], List[str]]:
    """Walk `path_object` and return `(all_files, all_dirs)` as lists of path strings."""
    all_files = []
    all_dirs = []
    for root, dirs, files in os.walk(path_object):
        all_files.extend(os.path.join(root, cur_file) for cur_file in files)
        all_dirs.extend(os.path.join(root, cur_dir) for cur_dir in dirs)
    return all_files, all_dirs


def this_file_or_dir_changes_frequently(path_string: str, username: str) -> bool:
    """Return True if `path_string` lies inside one of `HOME_DIRS_THAT_CHANGE_FREQUENTLY`.

    Only paths strictly below `/home/<username>/<folder>/` match; the folder itself doesn't.
    """
    home_slash_username = "/home/" + username + "/"
    volatile_prefixes = tuple(home_slash_username + folder + "/" for folder in HOME_DIRS_THAT_CHANGE_FREQUENTLY)
    return path_string.startswith(volatile_prefixes)


def recursively_list_all_files_and_all_dirs_except_some_that_change_frequently(path_object, username: str) -> Tuple[List[str], List[str]]:
    """Like `recursively_list_all_files_and_all_dirs()`, minus paths in frequently changing home folders."""
    all_files, all_dirs = recursively_list_all_files_and_all_dirs(path_object)
    filtered_files = [f for f in all_files if not this_file_or_dir_changes_frequently(f, username)]
    filtered_dirs = [d for d in all_dirs if not this_file_or_dir_changes_frequently(d, username)]
    return filtered_files, filtered_dirs


def count_files_and_dirs(path_object) -> Tuple[int, int]:
    """Return `(number_of_files, number_of_dirs)` found recursively under `path_object`."""
    all_files, all_dirs = recursively_list_all_files_and_all_dirs(path_object)
    return len(all_files), len(all_dirs)


def get_username() -> str:
    """Take the last component of the home directory as the username and ask the user to confirm it."""
    username = Path.home().parts[-1]
    get_user_confirmation("Is '{}' your username?".format(username))
    return username


def doing_debs_backup() -> bool:
    """Ask whether to also back up `DEBS_DIR`; an empty answer (just ENTER) means yes."""
    answer = input("This script always backs up '{}/'. Do you also want to back up '{}/'? ".format(YEBS_DIR, DEBS_DIR))
    return answer == YES


def get_user_confirmation(question: str) -> None:
    """Ask `question`; exit the script unless the user answers yes (just ENTER)."""
    answer = input(question + " ")
    if answer != YES:
        exit("Aborting script...")


def check_that_your_computer_has_the_right_folder_structure(username: str, files_or_test_files: str) -> None:
    """Check the working directory and the folders this script expects inside it.

    Raises `AssertionError` if the working directory isn't `/home/<username>/<files_or_test_files>`,
    if any of the `YEBS_DIR`, `DEBS_DIR` or `TEMP_DIR` folders is missing, or if a checksums
    file already exists in `YEBS_DIR`.
    """
    current_working_dir = Path.cwd()
    what_your_working_dir_should_be = "/home/{}/{}".format(username, files_or_test_files)
    _require(str(current_working_dir) == what_your_working_dir_should_be, "Your working directory is wrong.")
    for required_dir in (YEBS_DIR, DEBS_DIR, TEMP_DIR):
        _require(os.path.isdir(required_dir), "There's no '{}' directory.".format(required_dir))
    checksums_filename_with_path = os.path.join(YEBS_DIR, CHECKSUMS_FILENAME)
    assert_file_doesnt_exist_yet(checksums_filename_with_path)


def list_of_files_and_folders_in_dir(path_object: Path) -> List[str]:
    """Return the direct children of `path_object` as strings (not recursive)."""
    return [str(item) for item in path_object.iterdir()]


def assert_file_doesnt_exist_yet(file_path: str) -> None:
    """Raise `AssertionError` if anything already exists at `file_path`."""
    _require(not os.path.exists(file_path), "The file we want to create ({}) already exists.".format(file_path))


def assert_exists_and_is_dir(path_object: Path) -> None:
    """Raise `AssertionError` unless `path_object` exists and is a directory."""
    _require(path_object.exists(), "'{}' doesn't exist.".format(str(path_object)))
    _require(path_object.is_dir(), "'{}' isn't a directory.".format(str(path_object)))


def get_paths_to_target_dirs(local_dest: bool, username: str, files_or_test_files: str) -> Tuple[Path, Path]:
    """Locate the `yebs` and `debs` destination folders on the (real or dummy) external drive.

    With `local_dest`, the "media" folder is `<TEMP_DIR>/local-media`; otherwise it is `/media`.
    The drive is the single item inside `<media>/<username>`, and the user is asked to confirm
    its name.

    Returns `(external_yebs_dir, external_debs_dir)`. Raises `AssertionError` if the expected
    folder layout isn't found.
    """
    if local_dest:
        temp_dir = Path(TEMP_DIR)
        assert_exists_and_is_dir(temp_dir)
        media_dir = temp_dir / "local-media"
    else:
        media_dir = Path("/media")
    assert_exists_and_is_dir(media_dir)

    media_slash_username = media_dir / username
    assert_exists_and_is_dir(media_slash_username)
    # List the folder once so that the count check and the item we pick come from the same snapshot.
    mounted_items = list(media_slash_username.iterdir())
    _require(len(mounted_items) == 1, "There should be exactly 1 item in '{}'.".format(str(media_slash_username)))
    external_drive_path_object = mounted_items[0]
    assert_exists_and_is_dir(external_drive_path_object)
    external_drive_name = external_drive_path_object.parts[-1]
    get_user_confirmation("Is '{}' your external drive's name?".format(external_drive_name))

    external_drive_folder = files_or_test_files
    external_main_files_dir = external_drive_path_object / external_drive_folder
    assert_exists_and_is_dir(external_main_files_dir)
    _require(len(list_of_files_and_folders_in_dir(external_main_files_dir)) == 2, "There should be exactly 2 subdirectories: '{}' and '{}'. And no non-directory files.".format(YEBS_DIR, DEBS_DIR))
    external_yebs_dir = external_main_files_dir / YEBS_DIR
    assert_exists_and_is_dir(external_yebs_dir)
    external_debs_dir = external_main_files_dir / DEBS_DIR
    assert_exists_and_is_dir(external_debs_dir)
    return external_yebs_dir, external_debs_dir


def create_checksums_file() -> str:
    """Write SHA-512 checksums of every file under `YEBS_DIR` into that folder and verify them.

    The checksums are computed from inside `YEBS_DIR` so that the recorded paths start with './'.
    The working directory is restored afterwards, even if a command fails.

    Returns the path of the checksums file relative to the original working directory.
    Raises `subprocess.CalledProcessError` if `shasum` fails and `AssertionError` if the
    output of `shasum -c` looks wrong.
    """
    print_if_verbose("Creating the '{}/' backup's checksum file...".format(YEBS_DIR))
    # Remember where we started instead of relying on `os.chdir('..')`, which would land in
    # the wrong place if `YEBS_DIR` were a symlink.
    starting_dir = os.getcwd()
    os.chdir(YEBS_DIR)
    try:
        lines_of_checksum_file = []
        for root, dirs, files in os.walk('.'):
            for cur_file in files:
                cur_file_path = os.path.join(root, cur_file)
                proc = subprocess.run(['shasum', '-a', '512', cur_file_path], stdout=subprocess.PIPE, check=True, universal_newlines=True)
                lines_of_checksum_file.append(proc.stdout.strip())
        with open(CHECKSUMS_FILENAME, 'w') as output_file:
            output_file.write('\n'.join(lines_of_checksum_file) + '\n')
        print_if_verbose("Checksum file created. Now running `shasum -c` on it...")
        checking_proc = subprocess.run(['shasum', '-c', CHECKSUMS_FILENAME], stdout=subprocess.PIPE, check=True, universal_newlines=True)
        stdout_lines = checking_proc.stdout.strip().split('\n')
        for line in stdout_lines:
            _require(line.startswith('./'), "`shasum -c` output line doesn't start with './'")
            _require(line.endswith(': OK'), "`shasum -c` output line doesn't end with ': OK'")
        print_if_verbose("The `shasum -c` check worked. :)")
    finally:
        os.chdir(starting_dir)
    return os.path.join(YEBS_DIR, CHECKSUMS_FILENAME)


def create_backup_on_external_drive(format_to_use: str, external_yebs_dir, datestamp: str) -> None:
    """Archive `YEBS_DIR` (if the format needs an intermediate archive) and then encrypt it onto the drive.

    `format_to_use` is one of `FORMAT_OPTIONS`. Everything before its last extension is the
    intermediate archive type (for example '.tar.gz' for '.tar.gz.gpg'); when there is none
    (plain '.7z'), the directory itself is handed to the final step.
    """
    intermediate_extension = format_to_use.rpartition('.')[0]
    if intermediate_extension != "":
        intermediate_path = create_intermediate_file(intermediate_extension, YEBS_DIR)
    else:
        intermediate_path = YEBS_DIR
    try:
        input("Now for encryption. Waiting for you to press ENTER to begin. ")
        create_final_file(format_to_use, intermediate_path, external_yebs_dir, datestamp)
    finally:
        # Don't leave the unencrypted temporary archive behind if the user aborts or
        # encryption fails. This never touches `YEBS_DIR` because that isn't a regular file.
        _remove_if_file(intermediate_path)


def create_intermediate_file(intermediate_extension: str, dir_to_zip_up: str) -> str:
    """Create an unencrypted zip or tar archive of `dir_to_zip_up` in the working directory.

    Returns the archive's path. Raises `AssertionError` if the archive already exists or the
    extension is neither zip- nor tar-based, and `subprocess.CalledProcessError` if the
    archiver fails.
    """
    print("Creating", intermediate_extension, "file...")
    where_to_save_output = "tmp_" + YEBS_DIR + intermediate_extension
    assert_file_doesnt_exist_yet(where_to_save_output)
    if "zip" in intermediate_extension:
        program = "zip"
        flags = "-rq"
    else:
        _require("tar" in intermediate_extension, "Neither 'zip' nor 'tar' is in the `intermediate_extension`, which is '{}'.".format(intermediate_extension))
        program = "tar"
        if "gz" in intermediate_extension:
            flags = "-czf"
        elif "bz2" in intermediate_extension:
            flags = "-cjf"
        else:
            flags = "-cf"
    subprocess.run([program, flags, where_to_save_output, dir_to_zip_up], check=True)
    return where_to_save_output


def create_final_file(format_to_use: str, intermediate_path: str, external_yebs_dir, datestamp: str) -> None:
    """Encrypt `intermediate_path` with 7z or gpg into `<external_yebs_dir>/<datestamp>_ee<format_to_use>`.

    Afterwards `intermediate_path` is deleted if it is a regular file, whether or not the
    encryption succeeded. Raises `AssertionError` if the output file already exists or the
    format is neither 7z nor gpg, and `subprocess.CalledProcessError` if the encryption
    program fails.
    """
    where_to_save_output = os.path.join(str(external_yebs_dir), datestamp + "_ee" + format_to_use)
    try:
        assert_file_doesnt_exist_yet(where_to_save_output)
        if "7z" in format_to_use:
            subprocess.run(["7z", "a", "-p", "-mhe=on", "-ms=off", where_to_save_output, intermediate_path], check=True)
        else:
            _require("gpg" in format_to_use, "The final extension is neither '.7z' nor '.gpg'. The format is '{}'.".format(format_to_use))
            subprocess.run(["gpg", "-v", "-o", where_to_save_output, "-c", intermediate_path], check=True)
    finally:
        # Only delete it if it's a file, not if it's a dir. BTW, even if we didn't do this check, there wouldn't be risk of accidentally deleting our data directory because `os.remove()` fails if the path is a directory.
        _remove_if_file(intermediate_path)


def do_yebs_backup(specific_output_format: Optional[str], external_yebs_dir) -> None:
    """Make one encrypted, datestamped backup of `YEBS_DIR` on the external drive.

    Uses `specific_output_format` if given, otherwise a random entry of `FORMAT_OPTIONS`.
    The temporary checksums file is removed again even if the backup fails, so that a
    later run isn't blocked by a leftover file.
    """
    print("Starting '{}/' backup.".format(YEBS_DIR))
    format_to_use = specific_output_format if specific_output_format else choice(FORMAT_OPTIONS)
    datestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    checksums_file_path = create_checksums_file()
    try:
        create_backup_on_external_drive(format_to_use, external_yebs_dir, datestamp)
    finally:
        os.remove(checksums_file_path)
    print("Done with '{}/' backup.\n".format(YEBS_DIR))


def do_debs_backup(external_debs_dir) -> None:
    """Mirror `DEBS_DIR` to `external_debs_dir` with `rsync --delete`, after a confirmed dry run."""
    ending_slash_needed_to_make_rsync_work_correctly = "/"
    source = DEBS_DIR + ending_slash_needed_to_make_rsync_work_correctly
    destination = str(external_debs_dir) + ending_slash_needed_to_make_rsync_work_correctly

    print("In 5 seconds, you'll see a dry run of what rsync would do:")
    print("------------------------------")
    sleep(5)  # The delay is so the user can read the above message even if the output of the rsync command is so long that it fills the whole Terminal history.
    subprocess.run(['rsync', '-avn', '--delete', source, destination], check=True)
    print("------------------------------")
    get_user_confirmation("Does this look ok?")
    print("Now running rsync for real in 5 seconds...")
    print("------------------------------")
    sleep(5)
    subprocess.run(['rsync', '-av', '--delete', source, destination], check=True)
    print("------------------------------")
    print("Done with rsync.")
    print("")


def check_if_differences_between_two_lists(initial_list: List[str], final_list: List[str], files_or_dirs: str) -> None:
    """Report which items were lost or added between two listings of the home folder.

    The comparison ignores ordering, because the order in which `os.walk()` reports entries
    isn't guaranteed to be the same on two runs. Lost and new items are printed in sorted order.
    `files_or_dirs` is only used in the printed messages.
    """
    if sorted(initial_list) == sorted(final_list):
        print_if_verbose("\nThe list of {} under ~ on your computer between starting and finishing this script is the same. :)".format(files_or_dirs))
        return
    print("\nERROR: The list of {} under ~ on your computer between starting and finishing this script is NOT the same.".format(files_or_dirs))
    lost_items = sorted(set(initial_list) - set(final_list))
    new_items = sorted(set(final_list) - set(initial_list))
    if len(lost_items) > 0:
        print("The following {} were present initially but were lost by running the script. If they're important, restore them from a backup!!! And figure out what went wrong.".format(files_or_dirs))
        for item in lost_items:
            print('\t' + item)
    if len(new_items) > 0:
        print("The following {} weren't present initially but were added by running the script. After figuring out what went wrong, you should clean them up by hand.".format(files_or_dirs))
        for item in new_items:
            print('\t' + item)


def main() -> None:
    """Run the whole backup: sanity checks, optional debs sync, yebs backup, and before/after comparisons."""
    args = parse_input()
    test_that_required_programs_exist()
    files_or_test_files = "test_files" if args.use_test_folders else "files"
    print("FYI: This script asks you a few questions. To say yes, just press ENTER. To say no, type any letter and then press ENTER.")
    username = get_username()
    check_that_your_computer_has_the_right_folder_structure(username, files_or_test_files)
    external_yebs_dir, external_debs_dir = get_paths_to_target_dirs(args.local_dest, username, files_or_test_files)
    initial_all_computer_home_files, initial_all_computer_home_dirs = recursively_list_all_files_and_all_dirs_except_some_that_change_frequently(Path.home(), username)
    try:
        initial_external_yebs_file_count, initial_external_yebs_dir_count = count_files_and_dirs(external_yebs_dir)
        initial_external_debs_file_count, initial_external_debs_dir_count = count_files_and_dirs(external_debs_dir)
        if doing_debs_backup():
            do_debs_backup(external_debs_dir)
        do_yebs_backup(args.format, external_yebs_dir)
        final_external_yebs_file_count, final_external_yebs_dir_count = count_files_and_dirs(external_yebs_dir)
        final_external_debs_file_count, final_external_debs_dir_count = count_files_and_dirs(external_debs_dir)
        _require(initial_external_yebs_file_count + 1 == final_external_yebs_file_count, "This script should have added exactly one file to the external 'yebs/' directory. Instead, here are the numbers:\ninitial_external_yebs_file_count = {}\nfinal_external_yebs_file_count = {}".format(initial_external_yebs_file_count, final_external_yebs_file_count))
        _require(initial_external_yebs_dir_count == final_external_yebs_dir_count, "This script shoudln't have changed the number of subdirectories within the external 'yebs/' directory, but in fact, here are the numbers:\ninitial_external_yebs_dir_count = {}\nfinal_external_yebs_dir_count = {}".format(initial_external_yebs_dir_count, final_external_yebs_dir_count))
        print("While running this script, in '{}':".format(str(external_debs_dir)))
        print(" the number of files changed from {} to {}".format(initial_external_debs_file_count, final_external_debs_file_count))
        print(" the number of directories changed from {} to {}".format(initial_external_debs_dir_count, final_external_debs_dir_count))
    finally:
        # Always compare the home folder before and after, even when the backup failed,
        # so that lost or leftover files get reported.
        final_all_computer_home_files, final_all_computer_home_dirs = recursively_list_all_files_and_all_dirs_except_some_that_change_frequently(Path.home(), username)
        print("")
        if len(initial_all_computer_home_files) != len(final_all_computer_home_files):
            print("NOTE: The number of all computer home files changed while running this script, from {} to {}.".format(len(initial_all_computer_home_files), len(final_all_computer_home_files)))
        if len(initial_all_computer_home_dirs) != len(final_all_computer_home_dirs):
            print("NOTE: The number of all computer home dirs changed while running this script, from {} to {}.".format(len(initial_all_computer_home_dirs), len(final_all_computer_home_dirs)))
        check_if_differences_between_two_lists(initial_all_computer_home_files, final_all_computer_home_files, "files")
        check_if_differences_between_two_lists(initial_all_computer_home_dirs, final_all_computer_home_dirs, "directories")
        print("")
    print("Done.")


if __name__ == "__main__":
    main()