"""
By Brian Tomasik (https://briantomasik.com/).
First published: 2019-12-19. Last update of any kind: 2020-02-05T22-46.

To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.

This script was built with Python 3.6. Hopefully it should work for later versions of Python 3 also.


# Description

This script removes whitespace and other funky characters from the names of files and folders, keeping only alphanumeric characters, dashes, and underscores. The goal is to make the file and folder names nicer to work with on the command line or in url paths and less likely to cause problems. This script also shortens file and folder names that are unnervingly long, also to reduce the risk of technical problems.

If you run this script in directory `foo/`, it affects all files and folders recursively within `foo/`. It works from the bottom up: lowest-level files and folders are renamed before higher-level ones are renamed. (I worry that renaming the higher-level folders first could mess up renaming of the lower-level items, since the path to at least some lower-level items would have changed. I don't know if the `os.walk()` method I use could intelligently handle this issue or not, since I haven't bothered to test it.)

The transformation that this script applies to a file or folder name is usually idempotent, i.e., once a file or folder has been renamed, this script when run again on that file or folder would end up saying it should keep its new name. (There are a few exceptions. For example, if a file was initially named "blah-blah-long-file-name-_trunc.txt", then on rerunning this script, that file would change to "blah-blah-long-file-name_trunc.txt".) Because the transformation is usually idempotent, you can freely run this script in a directory where most files and folders have already been renamed without worrying about changing them further. If you Ctrl+c to abort this script part of the way through running it, you can just rerun the script later, and it should effectively pick up where you left off because you won't have to consider the files and folders you've already renamed.


# Example

Suppose you have the following folders and file name:

foo bar/Podcast episodes!/Invertebrates? Let's talk about 'em/Episode no. 73: Snails (and slugs). Here's some info all citizens should know.mp3

Running this Python script from the `foo bar/` directory would change the file and folder names to the following:

foo bar/Podcast-episodes/Invertebrates_Lets-talk-about-em/Episode-no-73_Snails_and-slugs_Heres-some-info-all-citizens-shou_trunc.mp3

(In case you're wondering, I made up all these file and folder names. This is not a real podcast.)
"""

#import pdb
import os
import re
import unicodedata

# Longest base name (the name without its extension) that is left untruncated.
MAX_LENGTH_OF_BASE_NAME = 70
# Marker inserted into a name to show that it was shortened.
TRUNCATION_INDICATOR = "_trunc"
# Number of leading characters kept in front of the marker, chosen so that a plainly truncated name is exactly MAX_LENGTH_OF_BASE_NAME characters long.
LENGTH_BEFORE_TRUNCATION_INDICATOR = MAX_LENGTH_OF_BASE_NAME - len(TRUNCATION_INDICATOR)
# When tags at the end of a name have to be preserved, at least this many characters of the non-tag part are kept.
MIN_NUMBER_OF_NON_TAG_CHARS_WE_SHOULD_KEEP_IN_BASE_NAME = 20
# Prompt fragments that get appended to the messages asking the user to type a replacement name (one wording for files, one for folders).
ASK_FOR_BASE_FILENAME_STRING = "please choose a custom, non-empty base filename for this file. Type it and then press ENTER. (What do I mean by 'base filename'? The filename without its extension. For example, in the filename 'foo-bar.txt', the base filename is 'foo-bar'.)\n"
ASK_FOR_FOLDER_NAME = "please choose a custom, non-empty folder name. Type it and then press ENTER.\n"

# Single-dot extensions that may additionally be wrapped in compression and/or encryption extensions.
various_extensions = ['.txt', '.md', '.html', '.htm', '.php', '.css', '.rss', '.tsv', '.csv', '.py', '.js', '.log', '.xml', '.tex', '.mp4', '.mov', '.qt', '.flv', '.mpeg', '.mpv', '.avi', '.wmv', '.m4v', '.m4p', '.m4a', '.webm', '.wav', '.mp3', '.aac', '.flac', '.au', '.ogg', '.mka', '.mkv', '.zip', '.gz', '.bz2', '.rar', '.tar', '.tgz', '.pdf', '.bin', '.jpg', '.jpeg', '.png', '.gif', '.tiff', '.itc']


def _with_suffix(extensions, suffix):
	"""Return a new list with `suffix` appended to every entry of `extensions`, keeping the order."""
	return [extension + suffix for extension in extensions]


# One extra layer on top of a plain extension, e.g. '.txt.gpg' or '.tar.gz'.
various_extensions_plus_gpg = _with_suffix(various_extensions, '.gpg')
various_extensions_plus_7z = _with_suffix(various_extensions, '.7z')
various_extensions_plus_gz = _with_suffix(various_extensions, '.gz')
various_extensions_plus_bz2 = _with_suffix(various_extensions, '.bz2')
# Two extra layers: compressed first, then encrypted or archived, e.g. '.txt.gz.gpg'.
various_extensions_plus_gz_plus_gpg = _with_suffix(various_extensions_plus_gz, '.gpg')
various_extensions_plus_gz_plus_7z = _with_suffix(various_extensions_plus_gz, '.7z')
various_extensions_plus_bz2_plus_gpg = _with_suffix(various_extensions_plus_bz2, '.gpg')
various_extensions_plus_bz2_plus_7z = _with_suffix(various_extensions_plus_bz2, '.7z')
# The three-part lists come before the two-part ones so that, when this list is searched front to back, a longer extension gets the chance to match before a shorter extension that is merely its tail.
KNOWN_EXTENSIONS_WITH_MULTIPLE_DOTS = (
	various_extensions_plus_bz2_plus_7z
	+ various_extensions_plus_bz2_plus_gpg
	+ various_extensions_plus_gz_plus_7z
	+ various_extensions_plus_gz_plus_gpg
	+ various_extensions_plus_bz2
	+ various_extensions_plus_gz
	+ various_extensions_plus_7z
	+ various_extensions_plus_gpg
)

# Top-level domains. During normalization, a '.' directly followed by one of these is rewritten as '-dot-' plus the TLD, tried in the order listed here.
LIST_OF_TLDS = ['com', 'org', 'net', 'gov', 'mil', 'edu', 'biz', 'me', 'name', 'info', 'io', 'ly', 'co', 'tv', 'us', 'ca', 'cn', 'fr', 'ch', 'au', 'in', 'de', 'jp', 'nl', 'uk', 'mx', 'no', 'ru', 'br', 'se', 'es'] # Selectively taken from [Mitchell (2007-2019)](https://www.lifewire.com/most-common-tlds-internet-domain-extensions-817511)
# Abbreviations whose dots are dropped during normalization. They are matched case-insensitively and only when preceded by a non-word character; the order of this list matters (see the note at the end of the line).
SOME_WORDS_WITH_DOTS_FOR_WHICH_WE_SHOULD_REMOVE_THE_DOTS = ['anon.', 'ca.', 'cf.', 'def.', 'e.g.', 'i.e.', 'ed.s', 'ed.', 'et al.', 'etc.', 'ibid.', 'illus.', 'ms.', 'mss.', 'n.d.', 'no.', 'pp.', 'pseud.', 'pub.', 'qtd.', 'trans.', 'viz.', 'vol.', 'vols.', 'vs.', 'a.i.', 'A.M.', 'cca.', 'cap.', 'cp.', 'C.V.', 'ff.', 'f.', 'i.a.', 'J.D.', 'lb.', 'lbs.', 'M.A.', 'M.O.', 'N.B.', 'op. cit.', 'p.a.', 'Ph.D.', 'P.M.', 'P.P.S.', 'P.S.', 'Q.E.D.', 'R.I.P.', 'stat.', 'ssp.', 'sp.', 'ex.', 'c.', 'p.'] # From http://scriptor.sprakverkstaden.uu.se/en/tools/conventions/abbreviations/text-abbrev/ and https://en.wikipedia.org/wiki/List_of_Latin_abbreviations . Note that I tried to put words that are substrings of other words later in this list, so that the longer form would match if possible rather than the shorter form. For example, I put 'p.' at the end so that it wouldn't match before other things like 'pp.', 'P.M.', 'P.P.S.', and so on have a chance to match, to avoid substituting on just the 'p.' part of those strings.
# Each of these characters, together with any whitespace directly around it, is replaced by a single '_' during normalization.
CHARS_TO_REPLACE_WITH_UNDERSCORE = ['[', ']', '(', ')', '{', '}', '^', '?', '|', '.', '\\', '"', '`', ':', ';', '!', '/']
# Used when truncating: everything from the first occurrence of any of these strings to the end of the name is treated as tags and preserved.
LIST_OF_POSSIBLE_SUBSTRINGS_OF_MY_PERSONAL_TAGS = ['_trunc', '_ee', '_ntmc', '_ytmc', '_ncsd', '_ycsd', '_scdi', '_scdw', '_tio', '_ntor', '_ytor', '_eom', '_vao', '_i19', '_i20', '_p19', '_p20', '_npy', '_r19', '_r20', '_s19', '_s20'] # The tags are explained in https://briantomasik.com/organizing-computer-files/ . Some of these so-called substrings are the entire tag, while some actually are proper substrings of tags. "_i19" could be a substring of, say, "_i1997-08". Similar for the other numerical tags.

def truncate_without_cutting_off_tags(base_name):
	"""Shorten `base_name` while keeping any tags at its end intact.

	The earliest position at which any entry of LIST_OF_POSSIBLE_SUBSTRINGS_OF_MY_PERSONAL_TAGS occurs is treated as the start of the tag section; everything from there to the end of the name is preserved.

	This function does not check whether `base_name` is actually too long; the caller decides when truncation is needed.

	Returns one of:
	- No tag found: the first LENGTH_BEFORE_TRUNCATION_INDICATOR characters followed by TRUNCATION_INDICATOR.
	- Tag section starts at or before index MIN_NUMBER_OF_NON_TAG_CHARS_WE_SHOULD_KEEP_IN_BASE_NAME + len(TRUNCATION_INDICATOR): `base_name` unchanged.
	- Otherwise: a shortened non-tag prefix, then TRUNCATION_INDICATOR (unless the tag section already contains it), then the tag section. The result can still be longer than MAX_LENGTH_OF_BASE_NAME when the tag section itself is long.
	"""
	# str.find() returns -1 for "not found", so only non-negative positions count as hits.
	tag_positions = [base_name.find(tag) for tag in LIST_OF_POSSIBLE_SUBSTRINGS_OF_MY_PERSONAL_TAGS]
	found_positions = [position for position in tag_positions if position != -1]
	if not found_positions: # no tags found; just truncate normally
		return base_name[:LENGTH_BEFORE_TRUNCATION_INDICATOR] + TRUNCATION_INDICATOR
	start_of_tags = min(found_positions)
	if start_of_tags <= MIN_NUMBER_OF_NON_TAG_CHARS_WE_SHOULD_KEEP_IN_BASE_NAME + len(TRUNCATION_INDICATOR):
		return base_name # The base name is already as short as we can make it. Either it's too short to truncate at all or if we truncated it, the truncation indicator would take up at least as much length as the length we cut off of the name itself.
	ending_tags_to_keep = base_name[start_of_tags:]
	# Give the non-tag part whatever room the tags leave, but never less than the guaranteed minimum.
	num_non_tag_chars_to_keep = max(LENGTH_BEFORE_TRUNCATION_INDICATOR - len(ending_tags_to_keep), MIN_NUMBER_OF_NON_TAG_CHARS_WE_SHOULD_KEEP_IN_BASE_NAME)
	# Don't add a second truncation marker if the preserved tags already include one.
	indicator_to_insert = "" if TRUNCATION_INDICATOR in ending_tags_to_keep else TRUNCATION_INDICATOR
	return base_name[:num_non_tag_chars_to_keep] + indicator_to_insert + ending_tags_to_keep

# A few lines of the following function were inspired by https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename
def normalize_base_name_string(base_name_string):
	"""Return a cleaned-up version of `base_name_string` (a folder name, or a file name without its extension).

	The result contains only ASCII word characters, '-' and '_', plus a leading '.' if the input had one. Roughly: whitespace becomes '-', stronger separators (punctuation, brackets, a dash next to a space) become '_', a few symbols are spelled out as words, and a name longer than MAX_LENGTH_OF_BASE_NAME is shortened via truncate_without_cutting_off_tags(). The result may be empty, or just '.', if nothing usable is left; the caller has to deal with that.
	"""
	base_name_string = base_name_string.replace('—', '-')
	renamed = unicodedata.normalize('NFKD', base_name_string).encode('ascii', 'ignore').decode('ascii') # Decompose accented characters and drop whatever isn't ASCII. See https://stackoverflow.com/questions/51710082/what-does-unicodedata-normalize-do-in-python
	name_starts_with_period = renamed.startswith('.')
	if name_starts_with_period:
		renamed = renamed[1:] # Remove the starting '.' for now; we'll add it back at the end.
	for tld in LIST_OF_TLDS:
		renamed = renamed.replace('.' + tld, '-dot-' + tld)
	renamed = renamed.replace('www.', 'www-dot-')
	for cur_word in SOME_WORDS_WITH_DOTS_FOR_WHICH_WE_SHOULD_REMOVE_THE_DOTS:
		word_without_dots = cur_word.replace('.', '')
		# The replacement is a function so that its text is inserted literally. re.escape() is only meant for patterns: applied to a replacement string it leaves a literal backslash in front of the space in multi-word abbreviations such as 'et al'.
		renamed = re.sub(r'(\W)' + re.escape(cur_word), lambda match, plain=word_without_dots: match.group(1) + plain, renamed, flags=re.IGNORECASE) # Inspired by the answer by "Blair Conrad" and "aland" on https://stackoverflow.com/questions/919056/case-insensitive-replace
	renamed = re.sub(r'\s+-\s*', '_', renamed) # A space plus a '-' suggests a somewhat significant break, worthy of a '_' separator.
	renamed = re.sub(r'\s*-\s+', '_', renamed)
	# Replace a dot separating digits with a '-'. Examples: '1.00' goes to '1-00' and '12.31.2018' goes to '12-31-2018'. The digits are matched with lookarounds rather than consumed, so a digit can sit between two dots ('2.4.17' goes to '2-4-17') and a single pass is enough.
	renamed = re.sub(r'(?<=\d)\.(?=\d)', '-', renamed)
	for cur_char in CHARS_TO_REPLACE_WITH_UNDERSCORE:
		renamed = re.sub(r'\s*' + re.escape(cur_char) + r'\s*', '_', renamed)
	# Spell out a few symbols as words; the surrounding spaces turn into '-' further down.
	renamed = re.sub(r'\s*\$\s*', ' USD', renamed)
	renamed = re.sub(r'\s*%\s*', ' percent ', renamed)
	renamed = re.sub(r'\s*&\s*', ' and ', renamed)
	renamed = re.sub(r'\s*\*\s*', ' star ', renamed)
	renamed = re.sub(r'\s*\+\s*', ' plus ', renamed)
	renamed = re.sub(r'\s*<\s*', ' lessthan ', renamed)
	renamed = re.sub(r'\s*=\s*', ' equals ', renamed)
	renamed = re.sub(r'\s*>\s*', ' greaterthan ', renamed)
	renamed = re.sub(r'\s*@\s*', ' at ', renamed)
	renamed = re.sub(r'\s*~\s*', ' tilde ', renamed)
	renamed = re.sub(r'[^\w\s-]', '', renamed) # Get rid of all other non-allowed characters.
	renamed = re.sub(r'\s+', '-', renamed)
	# Collapse runs of separators; whenever a '_' is involved, the '_' wins over '-'.
	renamed = re.sub(r'_-+', '_', renamed)
	renamed = re.sub(r'-+_', '_', renamed)
	renamed = re.sub(r'--+', '_', renamed)
	renamed = re.sub(r'__+', '_', renamed)
	renamed = re.sub(r'^_+', '', renamed) # Get rid of any '_' at the very start of the string.
	renamed = re.sub(r'^-+', '', renamed)
	renamed = re.sub(r'_+$', '', renamed) # Get rid of any '_' at the very end of the string.
	renamed = re.sub(r'-+$', '', renamed)
	if name_starts_with_period:
		renamed = '.' + renamed # Add the period back now that the removal regex stuff is done.
	if len(renamed) > MAX_LENGTH_OF_BASE_NAME:
		renamed = truncate_without_cutting_off_tags(renamed)
		renamed = renamed.replace('__', '_') # just in case we introduced a double underscore during truncation
	return renamed

def handle_exceptional_cases(root, orig_file_or_folder, renamed_base_name, extension, string_asking_for_custom_name):
	"""Keep asking the user for a different base name until the proposed one is usable.

	A proposed name is usable when it is non-empty and nothing exists yet at `root`/`renamed_base_name` + `extension`. If the name passed in already meets both conditions, it is returned without prompting. Names typed by the user are taken as they are.

	`orig_file_or_folder` is only used in the "would make the name empty" message; `string_asking_for_custom_name` is appended to every prompt.
	"""
	candidate = renamed_base_name
	while True:
		if candidate == "":
			candidate = input("For '{}', the current proposed renaming would make the name empty. To avoid that, {}".format(orig_file_or_folder, string_asking_for_custom_name))
			continue
		target_name = candidate + extension
		if os.path.exists(os.path.join(root, target_name)):
			candidate = input("Can't rename to '{}' because something already exists at that location. Instead, {}".format(target_name, string_asking_for_custom_name))
			continue
		return candidate

def splitext_while_preserving_multiple_extensions(filename):
	"""Split `filename` into (base name, extension), like os.path.splitext() but aware of multi-part extensions.

	If `filename` ends with an entry of KNOWN_EXTENSIONS_WITH_MULTIPLE_DOTS (for example '.txt.gpg'), the first such entry in list order is returned whole as the extension. Otherwise the result is whatever os.path.splitext() gives. The comparison is case-sensitive.
	"""
	matching_extension = next(
		(candidate for candidate in KNOWN_EXTENSIONS_WITH_MULTIPLE_DOTS if filename.endswith(candidate)),
		None,
	)
	if matching_extension is None:
		return os.path.splitext(filename)
	split_position = len(filename) - len(matching_extension)
	return filename[:split_position], matching_extension

def rename_file_or_folder(root, orig_file_or_folder, confirm_each_rename):
	"""Normalize the name of one file or folder located in `root` and rename it on disk.

	Parameters:
	- root: path of the directory that contains the item.
	- orig_file_or_folder: the item's current name (no path).
	- confirm_each_rename: if true, show the proposed name and let the user accept it, skip the item, or type a custom name.

	Nothing happens if the normalized name equals the current one or if the user chooses to skip. Even with `confirm_each_rename` false, the user is prompted when the proposed name is empty or already taken.

	Raises:
	- FileExistsError: if something exists at the target path right before the rename.
	"""
	orig_name_with_path = os.path.join(root, orig_file_or_folder)
	this_is_a_dir = os.path.isdir(orig_name_with_path)
	string_asking_for_custom_name = ASK_FOR_FOLDER_NAME if this_is_a_dir else ASK_FOR_BASE_FILENAME_STRING
	if this_is_a_dir:
		# A folder name is treated as one unit, even if it contains dots.
		base_name = orig_file_or_folder
		extension = ""
	else:
		base_name, extension = splitext_while_preserving_multiple_extensions(orig_file_or_folder)
	renamed_base_name = normalize_base_name_string(base_name)
	if renamed_base_name == base_name:
		return # No renaming necessary.
	print() # Blank line to visually separate the output for this item from the previous one.
	renamed_base_name = handle_exceptional_cases(root, orig_file_or_folder, renamed_base_name, extension, string_asking_for_custom_name)
	if len(renamed_base_name) > MAX_LENGTH_OF_BASE_NAME:
		print("Warning: Despite truncation, the renamed base name still exceeds {} characters because there were a lot of tags needing to be kept in the base name.".format(MAX_LENGTH_OF_BASE_NAME))
	if confirm_each_rename:
		option = input("I propose to rename the first line below to the second:\n{}\n{}\nJust press ENTER if that looks ok, enter 's' to skip renaming for this item, or enter 'c' if you'd like to instead give your own custom name that you want to rename to. ".format(orig_file_or_folder, renamed_base_name + extension))
		if option == 's':
			return
		elif option == 'c':
			renamed_base_name = input("Ok, so {}".format(string_asking_for_custom_name))
			# The custom name is used as typed, but it still must be non-empty and not collide with an existing item.
			renamed_base_name = handle_exceptional_cases(root, orig_file_or_folder, renamed_base_name, extension, string_asking_for_custom_name)
	new_name_with_path = os.path.join(root, renamed_base_name + extension)
	# Last line of defense against clobbering something. This is an explicit check rather than an `assert` because asserts are removed when Python runs with -O, and os.rename() can silently replace an existing file.
	if os.path.exists(new_name_with_path):
		raise FileExistsError("Refusing to rename '{}' to '{}' because something already exists at that location.".format(orig_name_with_path, new_name_with_path))
	os.rename(orig_name_with_path, new_name_with_path)
	print("Renamed '{}' to '{}'".format(orig_name_with_path, new_name_with_path))

def main():
	"""Ask whether each rename should be confirmed, then process everything below the current directory.

	The tree is walked bottom-up, so the contents of a folder are handled before the folder itself is renamed. Within each directory, files are handled before subfolders.
	"""
	mode_choice = input("Enter 'f' if you want to just run this script without confirming the changes. (Your interaction will still be required in exceptional circumstances.) To instead get the default behavior of this script where you'll have to confirm each renaming, just press ENTER now. ")
	confirm_each_rename = not (mode_choice == 'f')
	for root, dirs, files in os.walk('.', topdown=False):
		for entry_name in files + dirs:
			rename_file_or_folder(root, entry_name, confirm_each_rename)

if __name__ == "__main__":
	main()