""" By Brian Tomasik (https://briantomasik.com/). First published: 2019-12-13. Last update of any kind: 2020-04-08T04-19. To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty. This script was built with Python 3.6. Hopefully it should work for later versions of Python 3 also. I assume a Unix-like operating system where files have inodes. # Description This script monitors file fixity and attendance for all your files under a given directory, except for filepath substrings you exclude from monitoring. To use the script, just run python3 run.py Run the script twice, choosing the 'os' ("often small") option the first time and the 'ol' ("often large") option the second time (or do it in the opposite order -- it doesn't matter). Then rerun the script any time you want to check on your data and refresh the latest list of checksums. For more explanation of this script, see https://briantomasik.com/organizing-computer-files/#fixity_checking """ #import pdb import json from sys import exit import hashlib import os from datetime import datetime from collections import defaultdict IGNORE_FILE_PATH = "ignore.txt" DATETIME_FORMAT = '%Y-%m-%dT%H-%M-%S' BUFFER_SIZE_FOR_SHA_CALCULATION = 10**6 # The value doesn't matter too much. I tested empirically that a number too low (e.g., 10**2) slows things down by a factor of ~2. A higher number (e.g., 10**9) will of course use more RAM. It seems that using a number higher than ~10**6 doesn't really speed things up. HORIZONTAL_LINE = "-" * 10 SEPARATOR_TO_DELINEATE_OLD_FROM_NEW_OUTPUT = "*" * 40 KEEP_TEMPORARY_PROGRESS_OUTPUT_AT_MOST_THIS_MANY_CHARS = 70 NAME_OF_HISTORY_FOLDER_EXCEPT_FOR_CLASS_PREFIX = "-history_snapshots-when-everything-looked-ok" home_dir = os.path.expanduser("~") # Tweakable parameters FILE_TYPES_THAT_ARE_OFTEN_LARGE = ['.mp4', '.mov', '.qt', '.flv', '.mpeg', '.mpv', '.avi', '.wmv', '.m4v', '.m4p', '.m4a', '.webm', '.wav', '.mp3', '.aac', '.flac', '.au', '.ogg', '.mka', '.mkv', '.zip', '.gz', '.bz2', '.rar', '.7z', '.gpg', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.pdf', '.bin', '.jpg', '.jpeg', '.png', '.gif', '.tiff', '.itc'] # You can find more extensions here: https://en.wikipedia.org/wiki/List_of_file_formats EXTENSIONS_OF_FILES_THAT_TEND_TO_CHANGE_A_LOT = [".txt", ".md"] ALWAYS_IGNORE_THESE_FILEPATH_SUBSTRINGS_IN_ADDITION_TO_WHATS_IN_THE_IGNORE_FILE = [NAME_OF_HISTORY_FOLDER_EXCEPT_FOR_CLASS_PREFIX, "/files/temp/"] MONITOR_THE_DATA_UNDER_THIS_DIR = os.path.join(home_dir, "files") IGNORE_ABSOLUTE_VALUE_PERCENT_CHANGES_LESS_THAN_THIS_PERCENT_FOR_COUNTS_BY_FILE_TYPE = 1 assert IGNORE_ABSOLUTE_VALUE_PERCENT_CHANGES_LESS_THAN_THIS_PERCENT_FOR_COUNTS_BY_FILE_TYPE >= 0 WARN_ABOUT_CHANGED_INODE_VALUE_FOR_SAME_FILEPATH_AND_CHECKSUM = False def ask_to_clear_Terminal(): answer = input("Do you want to clear the Terminal so far? Enter 'y' if yes or anything else if no. ") if answer == 'y': os.system('clear') # More reading: https://stackoverflow.com/questions/2084508/clear-terminal-in-python def datetime_to_string(datetime_input): return datetime_input.strftime(DATETIME_FORMAT) def string_to_datetime(string_input): return datetime.strptime(string_input, DATETIME_FORMAT) def abort_with_explanation(most_recent_history_file): print("\nI see you've indicated that something doesn't look right with the changes I've shown. Therefore, I'm aborting the script here without saving any results to disk. You should investigate what's wrong, possibly restoring files from backups if need be. You can look in the history file '{}' for the old SHA-512 hash of a file if you need to check that. Once you think your files are fixed, rerun this script and review the output again. You can keep doing this as many times as needed to make sure everything looks good. Once things finally do look good, at that point you can save the new history file to disk as the new baseline.".format(most_recent_history_file)) exit() def divide_items_based_on_whether_tend_to_change_a_lot_and_apply_filters(items, filters_list): tend_to_change_a_lot = [] other = [] for orig_file_path, additional_info in items: if not string_contains_any_substring(orig_file_path, filters_list): if string_ends_with_any_substring(orig_file_path.lower(), EXTENSIONS_OF_FILES_THAT_TEND_TO_CHANGE_A_LOT): tend_to_change_a_lot.append((orig_file_path, additional_info)) else: other.append((orig_file_path, additional_info)) return tend_to_change_a_lot, other def print_particular_items_list(particular_items): for orig_file_path, additional_info in particular_items: print("\n" + orig_file_path) if additional_info != "": print(" `--> {}".format(additional_info)) def print_items(items, filters_list): print(HORIZONTAL_LINE) tend_to_change_a_lot, other = divide_items_based_on_whether_tend_to_change_a_lot_and_apply_filters(items, filters_list) if len(tend_to_change_a_lot) > 0: print("# HERE ARE THE ITEMS WITH FILE TYPES SUGGESTING THAT THESE FILES MAY TEND TO CHANGE A LOT (meaning that changes in file content may be unsurprising):") print_particular_items_list(tend_to_change_a_lot) if len(other) > 0: if len(tend_to_change_a_lot) > 0: print("\n") # Add two newlines to more clearly separate the two sections. print("# HERE ARE THE ITEMS WITH FILE TYPES SUGGESTING THAT THESE FILES MAY BE RELATIVELY STATIC (meaning that changes in file content may be more unexpected):") print_particular_items_list(other) print(HORIZONTAL_LINE) def get_feedback_on_items(items, description, most_recent_history_file): if len(items) == 0: return filters_list = [] while True: ask_to_clear_Terminal() print() print(description) print_items(items, filters_list) message = "Enter 'y' if everything here looks fine, 'f' if you'd like to filter out items from this list that contain a given substring (to make reviewing the list easier)" show_the_c_option = len(filters_list) > 0 if show_the_c_option: message = message.replace('to filter out', 'to further filter out') message = message.replace('a given substring', 'another given substring') message += ", 'c' if you want to clear all current filters applied to this list" message += ", or anything else to abort the script. " user_answer = input(message) options_offered_to_user = ['y', 'f'] if show_the_c_option: options_offered_to_user.append('c') if user_answer not in options_offered_to_user: abort_with_explanation(most_recent_history_file) elif user_answer == 'c': filters_list = [] elif user_answer == 'f': substring_to_filter = input("What substring would you like to filter? (This substring will be applied only to the original file path, not to the additional data about the file.) ") if len(substring_to_filter) == 0: print("WARNING: Your empty substring will filter out all results.") filters_list.append(substring_to_filter) else: assert user_answer == 'y' print_section_separator() return # The following function is mostly copied from the answer by "Randall Hunt" and "Community" in "[Hashing a file in Python](https://stackoverflow.com/questions/22058048/hashing-a-file-in-python)". def sha512(filename): sha512_hash = hashlib.sha512() with open(filename, 'rb') as cur_file: while True: data = cur_file.read(BUFFER_SIZE_FOR_SHA_CALCULATION) if not data: break sha512_hash.update(data) return sha512_hash.hexdigest() def create_inode_to_filepath_mapping(monitored_files_dict): inode_to_filepath_mapping = dict() for filepath in monitored_files_dict: inode = monitored_files_dict[filepath]['inode'] if inode in inode_to_filepath_mapping: print("WARNING: There's apparently more than one file path with the inode {}. Previously I saw that the file '{}' had that inode, and now I see that the file '{}' does. I think there should theoretically be a bijection between inodes and file paths at any given time, which implies that each inode should only have one file path associated with it. That's not true here. Did you edit/delete files while this script was running? That could have caused inode recycling, resulting in more than one file path with this inode.".format(inode, inode_to_filepath_mapping[inode], filepath)) inode_to_filepath_mapping[inode] = filepath return inode_to_filepath_mapping def file_extension_counts(list_of_file_extensions): counts = defaultdict(int) for extension in list_of_file_extensions: counts[extension] += 1 return counts def print_comparison_of_file_extension_counts(previous_list_of_file_extensions, new_list_of_file_extensions, threshold_for_absolute_value_of_percent_change): previous_file_extension_counts_dict = file_extension_counts(previous_list_of_file_extensions) new_file_extension_counts_dict = file_extension_counts(new_list_of_file_extensions) previous_keys = set(previous_file_extension_counts_dict.keys()) new_keys = set(new_file_extension_counts_dict.keys()) sorted_all_keys = sorted(previous_keys.union(new_keys)) for key in sorted_all_keys: # Since these are defaultdict, we can look up keys even if they don't exist: print_a_change_in_a_number(key, previous_file_extension_counts_dict[key], new_file_extension_counts_dict[key], threshold=threshold_for_absolute_value_of_percent_change) def file_size_counts(list_of_file_sizes): counts = dict() keys = ['less than a kilobyte', 'at least a kilobyte but less than a megabyte', 'at least a megabyte but less than a gigabyte', 'at least a gigabyte'] for key in keys: counts[key] = 0 for size in list_of_file_sizes: assert size >= 0 if size < 10**3: counts['less than a kilobyte'] += 1 elif size < 10**6: counts['at least a kilobyte but less than a megabyte'] += 1 elif size < 10**9: counts['at least a megabyte but less than a gigabyte'] += 1 else: counts['at least a gigabyte'] += 1 return counts def print_comparison_of_file_size_counts(previous_list_of_file_sizes, new_list_of_file_sizes): previous_file_size_counts_dict = file_size_counts(previous_list_of_file_sizes) new_file_size_counts_dict = file_size_counts(new_list_of_file_sizes) for key in previous_file_size_counts_dict: print_a_change_in_a_number(key, previous_file_size_counts_dict[key], new_file_size_counts_dict[key]) def print_section_separator(): print("\n" + SEPARATOR_TO_DELINEATE_OLD_FROM_NEW_OUTPUT + "\n") def ask_if_that_looks_ok(most_recent_history_file): answer = input("Does that look ok? Enter 'y' if yes or anything else to abort the script. ") if answer != 'y': abort_with_explanation(most_recent_history_file) else: print_section_separator() def print_a_change_in_a_number(brief_description, old_value, new_value, threshold=0): assert old_value >= 0, "Counts or sizes can't be negative." assert new_value >= 0, "Counts or sizes can't be negative." if old_value == 0 and new_value > 0: percent_change_string = "infinity" else: if old_value == 0 and new_value == 0: percent_change = 0 else: percent_change = 100 * (new_value - old_value)/old_value if abs(percent_change) < threshold: return percent_change_string = "{:.2f}".format(percent_change) print("\t{}: {}% ({:,} -> {:,}).".format(brief_description, percent_change_string, old_value, new_value)) def review_sanity_checks(previous_stats, new_stats, most_recent_history_file, file_type_class): print("First, let's review some basic info and sanity checks on your data before looking at particular files.") if previous_stats['dir monitored'] != new_stats['dir monitored']: print("WARNING: Previously you were monitoring the directory '{}', but now you're monitoring '{}'.".format(previous_stats['dir monitored'], new_stats['dir monitored'])) ask_if_that_looks_ok(most_recent_history_file) if previous_stats['ignored substrings'] != new_stats['ignored substrings']: print("WARNING: The list of substrings you're ignoring changed. It used to be:") print(previous_stats['ignored substrings']) print("Now it's:") print(new_stats['ignored substrings']) ask_if_that_looks_ok(most_recent_history_file) if previous_stats['list of often large file types'] != new_stats['list of often large file types']: print("WARNING: The list of often large file types changed. It used to be:") print(previous_stats['list of often large file types']) print("Now it's:") print(new_stats['list of often large file types']) print("This might cause some files to appear missing even though what actually happened is that they're now in the opposite file-type class.") ask_if_that_looks_ok(most_recent_history_file) print("Here are counts of all files and of all dirs, including those that you normally filter out:") print_a_change_in_a_number("number of all files", previous_stats['number of all files'], new_stats['number of all files']) print_a_change_in_a_number("number of all dirs", previous_stats['number of all dirs'], new_stats['number of all dirs']) print("And here are counts of all files and of all dirs after filtering out the ignorable ones:") print_a_change_in_a_number("number of files", previous_stats['number of files after filtering out ignorable ones'], new_stats['number of files after filtering out ignorable ones']) print_a_change_in_a_number("number of dirs", previous_stats['number of dirs after filtering out ignorable ones'], new_stats['number of dirs after filtering out ignorable ones']) print("Now looking at just those files after filtering ignorable ones that also fall within the '{}' file-type class:".format(file_type_class)) print_a_change_in_a_number("number of files", previous_stats['number of monitored files (i.e., filtered files that are also in the relevant file-type class)'], new_stats['number of monitored files (i.e., filtered files that are also in the relevant file-type class)']) print_a_change_in_a_number("sum total size of all monitored files", previous_stats['total size of all monitored files'], new_stats['total size of all monitored files']) ask_if_that_looks_ok(most_recent_history_file) print("Now for some distributional statistics about non-ignorable files within the '{}' file-type class.".format(file_type_class)) print("Here are changes in how many files fall within each file-size bucket:") print_comparison_of_file_size_counts(previous_stats['list of file sizes of monitored files'], new_stats['list of file sizes of monitored files']) ask_if_that_looks_ok(most_recent_history_file) print("Here are changes in how many files have a given file extension (ignoring cases where the absolute-value percent change was less than {}%):".format(IGNORE_ABSOLUTE_VALUE_PERCENT_CHANGES_LESS_THAN_THIS_PERCENT_FOR_COUNTS_BY_FILE_TYPE)) print_comparison_of_file_extension_counts(previous_stats['list of file extensions in monitored files'], new_stats['list of file extensions in monitored files'], IGNORE_ABSOLUTE_VALUE_PERCENT_CHANGES_LESS_THAN_THIS_PERCENT_FOR_COUNTS_BY_FILE_TYPE) ask_if_that_looks_ok(most_recent_history_file) def review_changes(previous_stats, new_stats, most_recent_history_file, file_type_class): review_sanity_checks(previous_stats, new_stats, most_recent_history_file, file_type_class) previous_monitored_files_dict = previous_stats['monitored files'] new_monitored_files_dict = new_stats['monitored files'] inode_to_filepath_mapping_for_new_monitored_files = create_inode_to_filepath_mapping(new_monitored_files_dict) same_filepath_different_checksums_same_mtime = [] same_filepath_different_checksums_newer_mtime = [] file_moved_same_checksums = [] cant_find_the_file = [] for filepath in previous_monitored_files_dict: previous_properties = previous_monitored_files_dict[filepath] if filepath in new_monitored_files_dict: # File still exists in the same location. new_properties = new_monitored_files_dict[filepath] if previous_properties['sha512'] == new_properties['sha512']: # Things look good. No need to report on this case. if previous_properties['inode'] != new_properties['inode']: if WARN_ABOUT_CHANGED_INODE_VALUE_FOR_SAME_FILEPATH_AND_CHECKSUM: print("WARNING: File '{}' stayed in the same location and has the same checksum value, but for some reason the inode value changed from {} to {}. Maybe you moved to a new hard drive?".format(filepath, previous_properties['inode'], new_properties['inode'])) else: previous_mtime = string_to_datetime(previous_properties['mtime']) new_mtime = string_to_datetime(new_properties['mtime']) if new_mtime == previous_mtime: same_filepath_different_checksums_same_mtime.append((filepath, "The mtime value of '{}' is the same now as it was in the most recent history file.".format(new_properties['mtime']))) else: explanation_if_get_warning_template = "This shouldn't have happened if you just edited the file; in that case the new mtime should have been more recent. But this could have happened if you renamed an already existing file that had an older mtime value to the current file name. Think about whether that's what happened here. Example: Suppose your file was named 'A.txt' and had REPLACE_ME of midnight on 2019-01-01. Suppose you also had a file 'B.txt' with mtime of midnight on 2018-12-01. Suppose you deleted 'A.txt' and then renamed 'B.txt' to 'A.txt'. Then it would appear that the mtime for 'A.txt' was older, generating this warning.\n" if new_mtime < previous_mtime: print("WARNING: For file '{}', 'new_mtime' of '{}' is earlier than 'previous_mtime' of '{}'. {}".format(filepath, datetime_to_string(new_mtime), datetime_to_string(previous_mtime), explanation_if_get_warning_template.replace("REPLACE_ME", "mtime"))) if new_mtime < string_to_datetime(previous_properties['datetime sha512 was computed']): print("WARNING: For file '{}', 'new_mtime' of '{}' does not equal the 'previous_mtime' of '{}', but for some reason, 'new_mtime' is earlier than previous 'datetime sha512 was computed' of '{}'. {}".format(filepath, datetime_to_string(new_mtime), datetime_to_string(previous_mtime), previous_properties['datetime sha512 was computed'], explanation_if_get_warning_template.replace("REPLACE_ME", "'datetime sha512 was computed'"))) same_filepath_different_checksums_newer_mtime.append((filepath, "File was last modified on '{}'.".format(new_properties['mtime']))) else: previous_inode = previous_properties['inode'] if previous_inode in inode_to_filepath_mapping_for_new_monitored_files: new_filepath_with_that_inode = inode_to_filepath_mapping_for_new_monitored_files[previous_inode] new_properties = new_monitored_files_dict[new_filepath_with_that_inode] if previous_properties['sha512'] == new_properties['sha512']: file_moved_same_checksums.append((filepath, "New file path is '{}'.".format(new_filepath_with_that_inode))) else: cant_find_the_file.append((filepath, "")) else: cant_find_the_file.append((filepath, "")) get_feedback_on_items(same_filepath_different_checksums_same_mtime, "These items kept the same file path, but the checksums differ, and the mtime stayed the same! That means THESE ARE PLAUSIBLE CANDIDATES FOR BIT ROT, since it doesn't look like you explicitly modified these files since last time (did you?).", most_recent_history_file) get_feedback_on_items(same_filepath_different_checksums_newer_mtime, "These items kept the same file path, but the checksums differ. The new mtime is more recent than the old mtime and more recent than when the previous checksum was computed (except in the rare event that this script just now printed warnings to the contrary). Probably you just edited these files (did you?).", most_recent_history_file) get_feedback_on_items(file_moved_same_checksums, "Each of these items changed file path but kept the same inode and the same checksum, so probably it was moved.", most_recent_history_file) get_feedback_on_items(cant_find_the_file, "Following are files that I knew about last time you ran this script but that I can't reliably find this time around. Here are possible reasons why: 1. The file may have been deleted. 2. The file may have been both moved (so the file path isn't the same) and edited (so the checksum isn't the same). Without at least one of those values staying constant, I can't be sure I've found the file. 3. Maybe you edited parameters to this script like which file extensions are 'os' versus 'ol', which file-path substrings to ignore, etc.", most_recent_history_file) final_confirmation = input("Those were all the things to review, and you said they all looked ok. Here's one final confirmation that everything looks good. Enter 'y' if yes or anything else to abort. Once you confirm, the new data statistics will be written to your history folder for this file-type class as the new baseline to use in the future.\nFINAL CONFIRMATION: ") if final_confirmation != 'y': abort_with_explanation(most_recent_history_file) def read_history_file(filepath): with open(filepath) as history_file: return json.load(history_file) def write_to_a_new_history_file(new_stats, history_dir): output_filename = datetime_to_string(datetime.now()) + ".json" output_filename_with_path = os.path.join(history_dir, output_filename) with open(output_filename_with_path, "w") as output_file: json.dump(new_stats, output_file, indent=2) print("Wrote a new history file: '{}'".format(output_filename_with_path)) def find_most_recent_history_file(history_dir): json_files = [f for f in os.listdir(history_dir) if f.endswith(".json")] most_recent_datetime_value = None most_recent_file_so_far = None for json_file in json_files: name, extension = os.path.splitext(json_file) datetime_value = string_to_datetime(name) if (most_recent_datetime_value == None) or (datetime_value > most_recent_datetime_value): most_recent_datetime_value = datetime_value most_recent_file_so_far = json_file if most_recent_file_so_far == None: return None else: most_recent_file_with_path = os.path.join(history_dir, most_recent_file_so_far) assert os.path.isfile(most_recent_file_with_path), "'{}' is not a file!".format(most_recent_file_with_path) print("Most recent history file is '{}'".format(most_recent_file_with_path)) return most_recent_file_with_path def create_history_dir_if_need_be(file_type_class): history_dir = file_type_class + NAME_OF_HISTORY_FOLDER_EXCEPT_FOR_CLASS_PREFIX if not os.path.isdir(history_dir): os.mkdir(history_dir) return history_dir def list_all_files_and_dirs(dir_to_walk): print("Getting a list of all files and a list of all folders under '{}'...".format(dir_to_walk)) all_files = [] all_dirs = [] for root, dirs, files in os.walk(dir_to_walk): for cur_file in files: all_files.append(os.path.join(root, cur_file)) for cur_dir in dirs: all_dirs.append(os.path.join(root, cur_dir)) return all_files, all_dirs def string_contains_any_substring(input_string, list_of_substrings): for substring in list_of_substrings: if substring in input_string: return True return False def string_ends_with_any_substring(input_string, list_of_substrings): for substring in list_of_substrings: if input_string.endswith(substring): return True return False def file_is_in_current_file_type_class(filename, file_type_class): cur_file_type_is_ol = string_ends_with_any_substring(filename.lower(), FILE_TYPES_THAT_ARE_OFTEN_LARGE) if file_type_class == "os": return not cur_file_type_is_ol else: assert file_type_class == "ol" return cur_file_type_is_ol def print_progress_without_going_to_new_line(one_line_string_to_print): assert '\n' not in one_line_string_to_print if len(one_line_string_to_print) > KEEP_TEMPORARY_PROGRESS_OUTPUT_AT_MOST_THIS_MANY_CHARS: dot_dot_dot = '...' length_before_dot_dot_dot = KEEP_TEMPORARY_PROGRESS_OUTPUT_AT_MOST_THIS_MANY_CHARS - len(dot_dot_dot) one_line_string_to_print = one_line_string_to_print[:length_before_dot_dot_dot] + dot_dot_dot # First, cover up with white spaces output from the previous line, in case it was longer than the output of the new line. print(' ' * KEEP_TEMPORARY_PROGRESS_OUTPUT_AT_MOST_THIS_MANY_CHARS + '\r', end="") # Now print the new line. print(one_line_string_to_print + '\r', end="") def convert_home_directory_to_tilde(absolute_path): return absolute_path.replace(home_dir, '~') def calculate_new_stats(dir_to_monitor, file_type_class, substrings_to_ignore): new_stats = dict() all_files, all_dirs = list_all_files_and_dirs(dir_to_monitor) new_stats['dir monitored'] = convert_home_directory_to_tilde(dir_to_monitor) new_stats['ignored substrings'] = substrings_to_ignore new_stats['list of often large file types'] = FILE_TYPES_THAT_ARE_OFTEN_LARGE new_stats['number of all files'] = len(all_files) new_stats['number of all dirs'] = len(all_dirs) filtered_files = [f for f in all_files if not string_contains_any_substring(f, substrings_to_ignore)] filtered_dirs = [d for d in all_dirs if not string_contains_any_substring(d, substrings_to_ignore)] new_stats['number of files after filtering out ignorable ones'] = len(filtered_files) new_stats['number of dirs after filtering out ignorable ones'] = len(filtered_dirs) files_to_monitor = [f for f in filtered_files if file_is_in_current_file_type_class(f, file_type_class)] new_stats['number of monitored files (i.e., filtered files that are also in the relevant file-type class)'] = len(files_to_monitor) monitored_files = dict() list_of_file_extensions_in_monitored_files = [] list_of_file_sizes_of_monitored_files = [] total_size_of_all_monitored_files = 0 number_file_being_worked_on = 0 print("Below you can see the number of the file being worked on relative to the total, and what its file name is.") for cur_file in files_to_monitor: cur_file_path, cur_file_name = os.path.split(cur_file) number_file_being_worked_on += 1 print_progress_without_going_to_new_line("{} / {} {}".format(number_file_being_worked_on, new_stats['number of monitored files (i.e., filtered files that are also in the relevant file-type class)'], cur_file_name)) base, extension = os.path.splitext(cur_file) list_of_file_extensions_in_monitored_files.append(extension) cur_file_properties = os.stat(cur_file) list_of_file_sizes_of_monitored_files.append(cur_file_properties.st_size) total_size_of_all_monitored_files += cur_file_properties.st_size properties_to_store = dict() properties_to_store['inode'] = cur_file_properties.st_ino properties_to_store['sha512'] = sha512(cur_file) properties_to_store['datetime sha512 was computed'] = datetime_to_string(datetime.now()) mtime_as_datetime = datetime.fromtimestamp(cur_file_properties.st_mtime) right_now = datetime.now() assert right_now >= mtime_as_datetime, "The mtime value for file '{}' of '{}' is later than the time now ({})?!".format(cur_file, mtime_as_datetime, right_now) properties_to_store['mtime'] = datetime_to_string(mtime_as_datetime) monitored_files[convert_home_directory_to_tilde(cur_file)] = properties_to_store new_stats['monitored files'] = monitored_files new_stats['list of file extensions in monitored files'] = list_of_file_extensions_in_monitored_files new_stats['list of file sizes of monitored files'] = list_of_file_sizes_of_monitored_files new_stats['total size of all monitored files'] = total_size_of_all_monitored_files return new_stats def ignore_list(): substrings_to_ignore = ALWAYS_IGNORE_THESE_FILEPATH_SUBSTRINGS_IN_ADDITION_TO_WHATS_IN_THE_IGNORE_FILE if os.path.isfile(IGNORE_FILE_PATH): with open(IGNORE_FILE_PATH) as cur_file: for line in cur_file: substrings_to_ignore.append(line.strip()) else: print("WARNING: You don't have an '{}' file. Only the default ignorable substrings will be ignored.".format(IGNORE_FILE_PATH)) return substrings_to_ignore def get_file_type_class(): answer = input("Enter 'os' to run this script for file types that are often small or 'ol' to run it for file types that are often large. Or enter 'h' to read more explanation about what this means. ") assert answer in ['os', 'ol', 'h'], "Your input '{}' wasn't one of the options.".format(answer) if answer == 'h': print("It's useful to divide the files monitored into those that are generally small and those that are generally large, allowing you to check just one or the other kind. The generally small files (like `.txt` files) tend to be more numerous and updated more often, but they're also faster to check. You may want to check them regularly without being slowed down by also checking larger, more static files all the time. This program creates two different history folders, one for each class (the generally small file types and the generally large file types).") answer = input("Now, with that explanation, enter one of 'os' or 'ol'. ") assert answer in ['os', 'ol'], "Your input '{}' wasn't one of the options.".format(answer) return answer def main(): assert os.path.isdir(MONITOR_THE_DATA_UNDER_THIS_DIR), "'{}' isn't a directory.".format(MONITOR_THE_DATA_UNDER_THIS_DIR) file_type_class = get_file_type_class() new_stats = calculate_new_stats(MONITOR_THE_DATA_UNDER_THIS_DIR, file_type_class, ignore_list()) history_dir = create_history_dir_if_need_be(file_type_class) most_recent_history_file = find_most_recent_history_file(history_dir) if most_recent_history_file != None: previous_stats = read_history_file(most_recent_history_file) review_changes(previous_stats, new_stats, most_recent_history_file, file_type_class) write_to_a_new_history_file(new_stats, history_dir) if __name__ == "__main__": main()