# Crossvalidate class
from __future__ import division
import util
import copy
import combiner
import numpy
import pdb
import cPickle
import os
import to_latex
from time import localtime
from rpy import r as rc

__author__ = "Brian Tomasik"
__date__ = "April/May 2009"


class Crossvalidate(object):
    """
    Perform crossvalidation using the Combiner class.
    """
    def __init__(self, verbosity=1, basedir="../../.."):
        self.verbosity = verbosity
        self.basedir = basedir
        self.cv_folds = self._read_cv_folds()

    def _progress(self, message):
        """
        Write info if high enough verbosity.
        """
        if self.verbosity > 0:
            util.info(message)

    def tune_param(self, values_to_try, param_type, kwargs, special_name=None,
                   tagtypes_to_run=["Pandora Genres", "Pandora Acoustic"]):
        """
        Runs crossvalidation for a number of different parameter values.

        Required params:
        - values_to_try: a list of the different parameter values at which to
          run (e.g., [100, 1000, 10000] for param_type=="mcmc_reps").
        - param_type: the name of a parameter that's accepted as an input
          argument to the Combiner class's constructor (e.g., "min_tag_count").
        - kwargs: a dictionary of parameter-value pairs that you want passed
          to the Combiner class's constructor in addition to the parameter
          currently being tuned. For instance, if you wanted to set
          "max_n_songs" at 50 while tuning "mcmc_reps", you would set this
          argument to {"max_n_songs": 50}. USUALLY, JUST SET THIS TO {}.

        Optional params:
        - special_name: If this isn't None, the output directory will have
          this string in place of param_type as an identifier. This parameter
          is useful when you want to differentiate a parameter sweep done with
          unusual kwargs values from one done with the default values.
        - tagtypes_to_run: Can be any list of items from this list:
          ["Pandora Genres", "Pandora Acoustic", "All", "Last.fm"]. The
          default is ["Pandora Genres", "Pandora Acoustic"] for the ISMIR '09
          paper runs.

        No return value. Rather, this function creates a new directory with a
        datestamp corresponding to the hour you ran this function (in EST).
        Lots of output files are generated and put there:
        - Files ending in ".pkl" are pickle files of all the results generated
          on that round of crossvalidation. Use these when you want to play
          around with the results further in Python rather than just viewing
          them dumped out.
        - Files ending in "_betas.tab" give readable results on the level of
          individual tags.
        - Files ending in "_formatted.txt" give dumps of the dictionaries in
          the corresponding pickle files, starting with the most salient
          results at the top.
        - In addition to per-param-setting files, there are "overall" files,
          which have the dictionary of individual-parameter-results
          dictionaries. Also, "overall.tex" has the same output as would be
          generated by running this command in the results directory:
              python ../to_latex.py -p "overall.pkl"
          This command generates roughly the right LaTeX code for inclusion as
          a table in a .tex document. Note that "overall.tex" does not itself
          compile into a LaTeX document, since it lacks the headings, etc.
        """
        # Transfer our parameter settings to the kwargs that will be used to
        # generate Combiner instances.
        kwargs["verbosity"] = self.verbosity
        kwargs["basedir"] = self.basedir
        datestamp = self._datestamp()
        if special_name is not None:
            # Name this a special directory, not just the current param_type.
dirname = "%s_tuning_%s" % (special_name, datestamp) else: dirname = "%s_tuning_%s" % (param_type, datestamp) self._make_dir(dirname) per_tagtype_results = dict() for tagtype in tagtypes_to_run: kwargs["tagtype"] = tagtype per_param_results = dict() for val in values_to_try: kwargs[param_type] = val param_settings = "%s_%s=%s" % (tagtype, param_type, val) cur_outfile_stem = "%s/%s" % (dirname, param_settings) self._progress("Doing CV for %s." % param_settings) per_param_results[val] = self.get_results(kwargs, outfile_stem=cur_outfile_stem) per_tagtype_results[tagtype] = per_param_results self._write_pickle("%s/overall" % dirname, per_tagtype_results) util.write_file("%s/overall.tex" % dirname, to_latex.LatexWriter().param_sweep_table(per_tagtype_results)) self._write_pickle("%s/ttests" % dirname, self._get_ttests_dict(per_tagtype_results)) def generate_readable_tab_files(self): """ Runs crossvalidation four times, with the regression model alternately set to All3&P, CB, CF, and WD. The desired files end with "_betas.tab". No parameters or return value. See the documentation for the tune_param function for an explanation of the directory and output files generated. """ self.tune_param(["CB and WD and CF and P", "CB", "CF", "WD"], "regmodel", {}) def try_all_regression_models(self): """ Runs crossvalidation on all the models, including subsets of the sources and the Random model. No parameters or return value. See the documentation for the tune_param function for an explanation of the directory and output files generated. """ self.tune_param(["CB and WD and CF and P", "CB and WD and CF", "CB and WD and CF and P and I", "CB", "CF", "WD", "CB and WD", "CB and CF", "WD and CF", "Random"], "regmodel", {}) def try_all_regression_types(self): """ Runs crossvalidation on all the models, including subsets of the sources and the Random model. No parameters or return value. See the documentation for the tune_param function for an explanation of the directory and output files generated. """ self.tune_param(combiner.ALL_REGRESSIONS, "regtype", {}) def _datestamp(self): cur_time = localtime() return "%d-%d-%d-%d" % (cur_time[0], cur_time[1], cur_time[2], cur_time[3]) def _make_dir(self, dirname): try: os.mkdir(dirname) except OSError, e: if e[0] == 17: # directory exists pass else: raise e def _write_pickle(self, filename_stem, results, also_save_txt=True): self._progress("Pickling results dictionary.") file = open("%s.pkl" % filename_stem, 'wb') cPickle.dump(results, file, cPickle.HIGHEST_PROTOCOL) file.close() if also_save_txt: util.write_file("%s_dict.txt" % filename_stem, str(results)) def get_results(self, kwargs, write_to_file=True, write_pickle=True, outfile_stem="results", write_normalized_betas=True): """ Runs 5 folds of crossvalidation and saves the results. Required params: - kwargs: a dictionary of parameter-value pairs that you want passed to the Combiner class's constructor. For instance, if you wanted to set "verbosity" to 0, you would set this argument to {"max_n_songs": 50}. USUALLY, JUST SET THIS TO {}. Optional params: - write_to_file: Should we write the results to a file called "%s_formatted.txt" % outfile_stem ? - write_pickle: Should we save the results in a pickle called "%s.pkl" % outfile_stem ? - outfile_stem: The beginning of the output file names. - write_normalized_betas: Should we write a human-readable file with per-tag results, including normalized beta values? If so, it will be stored in a file called "%s_betas.tab" % outfile_stem . 
""" self._progress("Getting CV results.") per_fold_results = dict() per_fold_beta = dict() per_fold_best_worst_songs = dict() end_results = dict() n_folds = len(self.cv_folds) # 5 folds combiner_kwargs = copy.deepcopy(kwargs) # The next line mutates the kwargs, so avoid doing that to the original. combiner_kwargs = self._prune_tags(combiner_kwargs) for fold_no in range(n_folds): train_index = fold_no % n_folds test_index = (train_index + 1) % n_folds self._progress("Doing CV with train_index=%d, test_index=%d." % (train_index, test_index)) c = combiner.Combiner(fold_no=fold_no, **combiner_kwargs) per_fold_results[fold_no] = c.evaluate_regression(training_songs=self.cv_folds[train_index], testing_songs=self.cv_folds[test_index]) per_fold_beta[fold_no] = c.beta per_fold_best_worst_songs[fold_no] = c.best_worst_songs (end_results["results_each_fold"], end_results["per_tag_avg"], end_results["overall_avg_list"], end_results["overall_avg"]) = self._combine_fold_vals(per_fold_results, input_is_beta=False) (end_results["beta_each_fold"], end_results["per_tag_avg_beta"], end_results["overall_avg_beta_list"], end_results["overall_avg_beta"]) = self._combine_fold_vals(per_fold_beta, input_is_beta=True) end_results["best_worst_songs"] = self._combine_folds_best_worst_songs(per_fold_best_worst_songs) # Build a string-buffer-like list of the text to output using "\n".join(output_list) output_list = [] output_list.append("-----") output_list.append("regtype = %s" % c.regtype) output_list.append("tagtype = %s" % c.tagtype) output_list.append("regmodel = %s" % c.regmodel) output_list.append("min tag count = %d" % c.min_tag_count) output_list.append("min feature count %d" % c.min_feature_count) output_list.append("\nOverall average stats:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["overall_avg"])) output_list.append("Overall average beta:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["overall_avg_beta"])) output_list.append("Best/worst songs:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["best_worst_songs"])) output_list.append("Average stats:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["per_tag_avg"])) output_list.append("Average beta:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["per_tag_avg_beta"])) output_list.append("Per-fold stats:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["results_each_fold"])) output_list.append("Per-fold beta:") output_list.append("--------------") output_list.append("%s\n" % str(end_results["beta_each_fold"])) output_list.append("\n\n") # Get the string, write it, and possibly print it. output_string = "\n".join(output_list) if write_to_file: util.write_file("%s_formatted.txt" % outfile_stem, output_string) if write_pickle: self._write_pickle(outfile_stem, end_results, also_save_txt=False) if write_normalized_betas: util.write_file("%s_betas.tab" % outfile_stem, self._readable_tab_file(end_results["per_tag_avg"], end_results["per_tag_avg_beta"], end_results["best_worst_songs"])) self._progress(output_string[:1000]) return end_results def _readable_tab_file(self, per_tag_avg, per_tag_avg_beta, per_tag_best_worst_songs): # Store a list of lists of the form [tag, autotag_beta, propagated_beta, web_beta, auc, map, rprec, 10prec, topsong1, song1ingroundtruth?, topsong2, ...], if our current run has all of those available. If not, just omit the betas. 
separator = "\t" example_tag = per_tag_avg_beta.keys()[0] sources = per_tag_avg_beta[example_tag].keys() have_all_three_betas = "CB" in sources and "CF" in sources and "WD" in sources # Make header row. header_list = ["Tag"] if have_all_three_betas: header_list.extend(["CB-BetaFrac", "CF-BetaFrac", "WD-BetaFrac"]) header_list.extend(["AUC", "MAP", "10-Prec", "R-Prec"]) for i in range(len(per_tag_best_worst_songs[example_tag]["Best Song"])): header_list.append("Top5Songs#%d" % i) header_list.append("#%dCorrect?" % i) for i in range(len(per_tag_best_worst_songs[example_tag]["Worst Song"])): header_list.append("Bottom5Songs#%d" % i) header_list.append("#%dCorrect?" % i) # Start the output list, which will become the output string with a "\n".join(output_list) output_list = [separator.join(header_list)] # Get the list of lists to add and sort by CB beta fraction. tag_lists = [] for (tag, source_dict) in per_tag_avg_beta.iteritems(): list_for_cur_line = [tag] if have_all_three_betas: try: autotag_beta = source_dict["CB"]["beta"] propagated_beta = source_dict["CF"]["beta"] web_beta = source_dict["WD"]["beta"] sum_of_betas = propagated_beta + autotag_beta + web_beta list_for_cur_line.extend([autotag_beta / sum_of_betas, propagated_beta / sum_of_betas, web_beta / sum_of_betas]) except: list_for_cur_line.extend(["(missing)" for counter in range(3)]) # Now add results info. results_dict = per_tag_avg[tag] list_for_cur_line.extend([results_dict["AUC"], results_dict["MAP"], results_dict["10-Prec"], results_dict["R-Prec"]]) # Now add best/worst songs. list_for_cur_line.extend(self._convert_best_or_worst_songs_list(per_tag_best_worst_songs[tag]["Best Song"])) list_for_cur_line.extend(self._convert_best_or_worst_songs_list(per_tag_best_worst_songs[tag]["Worst Song"])) # Done with this line. tag_lists.append(list_for_cur_line) # Now, sort by CB's beta fraction in decreasing order. tag_lists.sort(key=lambda tuple: tuple[1], reverse=True) for tag_list in tag_lists: output_list.append(separator.join(map(lambda x: str(x), tag_list))) return "\n".join(output_list) def _convert_best_or_worst_songs_list(self, best_or_worst_songs_list): """ Prepare the contents of best_song_list or worst_song_list for output. """ N_BEST_OR_WORST_SONGS = 5 out_list = [] for i in range(N_BEST_OR_WORST_SONGS): try: (artist_and_song, in_ground_truth) = best_or_worst_songs_list[i] out_list.append(artist_and_song) out_list.append(in_ground_truth) except IndexError: out_list.append("(missing)") out_list.append("(missing)") return out_list def _prune_tags(self, combiner_kwargs): # Figure out which tags to use. self._progress("Pruning CV tags.") nonrare_tags = None # IMPORTANT: We need to make sure we use the same set of tags for all regression models, both those that use fewer and those that use more features. So we need to act as though we're going to use all the features here, so that we get the most restrictive tag set. Therefore, change combiner_kwargs. temp_kwargs_for_pruning = copy.deepcopy(combiner_kwargs) temp_kwargs_for_pruning["regmodel"] = "CB and WD and CF and P and I" for (fold_no, song_set) in self.cv_folds.items(): c = combiner.Combiner(fold_no=fold_no, **temp_kwargs_for_pruning) cur_fold_nonrare_tags = c.nonrare_tags(song_set) # Update using that info. 
            if nonrare_tags is None:
                nonrare_tags = cur_fold_nonrare_tags
            else:
                nonrare_tags = nonrare_tags.intersection(cur_fold_nonrare_tags)
        try:
            orig_value_only_these_tags = temp_kwargs_for_pruning["only_these_tags"]
            assert util.is_subset(nonrare_tags, orig_value_only_these_tags), \
                "Nonrare tags shouldn't include any more than you started with...."
        except KeyError:
            pass
        combiner_kwargs["only_these_tags"] = nonrare_tags
        return combiner_kwargs

    def _combine_fold_vals(self, per_fold_results, input_is_beta):
        results_each_fold = dict()
        for fold_results in per_fold_results.values():
            for (tag, val_dict) in fold_results.iteritems():
                cur_tag_dict = results_each_fold.get(tag, {})
                for (val_id, val) in val_dict.iteritems():
                    if not input_is_beta:
                        cur_tag_dict.setdefault(val_id, []).append(val)
                    else:
                        # beta dictionaries have an extra level
                        cur_source_dict = cur_tag_dict.get(val_id, {})
                        for (stat, number) in val.iteritems():
                            cur_source_dict.setdefault(stat, []).append(number)
                        cur_tag_dict[val_id] = cur_source_dict
                results_each_fold[tag] = cur_tag_dict
        per_tag_avg = self._per_tag_avg(results_each_fold, input_is_beta)
        (overall_avg_list, overall_avg) = self._overall_avg(per_tag_avg,
                                                            input_is_beta)
        return (results_each_fold, per_tag_avg, overall_avg_list, overall_avg)

    def _per_tag_avg(self, results_each_fold, input_is_beta):
        N_FOLDS = 5
        per_tag_avg = dict()
        for (tag, val_dict) in results_each_fold.iteritems():
            for (val_id, val_list) in val_dict.iteritems():
                cur_tag_dict = per_tag_avg.get(tag, {})
                if not input_is_beta:
                    cur_tag_dict[val_id] = util.mean_if_numeric(val_list)
                else:
                    # beta dictionaries have an extra level
                    cur_source_dict = dict()
                    for (stat, number_list) in val_list.iteritems():
                        cur_source_dict[stat] = util.mean_if_numeric(number_list)
                    cur_tag_dict[val_id] = cur_source_dict
                per_tag_avg[tag] = cur_tag_dict
        return per_tag_avg

    def _overall_avg(self, per_tag_avg, input_is_beta):
        """
        NOTE: The std errors returned here are over each tag, but don't count
        the 5 folds of CV for each tag. So actual std errors are those divided
        by sqrt(5).
        """
        overall_avg_list = dict()
        for val_dict in per_tag_avg.values():
            for (val_id, avg_val) in val_dict.iteritems():
                if not input_is_beta:
                    overall_avg_list.setdefault(val_id, []).append(avg_val)
                else:
                    dict_of_avg_lists = overall_avg_list.get(val_id, {})
                    for (stat, number) in avg_val.iteritems():
                        dict_of_avg_lists.setdefault(stat, []).append(number)
                    overall_avg_list[val_id] = dict_of_avg_lists
        overall_avg = dict()
        for (val_id, avg_list) in overall_avg_list.iteritems():
            if not input_is_beta:
                overall_avg[val_id] = util.summary_stats(avg_list)
            else:
                dict_of_averages = dict()
                for (stat, number_list) in avg_list.iteritems():
                    dict_of_averages[stat] = util.summary_stats(number_list)
                overall_avg[val_id] = dict_of_averages
        return (overall_avg_list, overall_avg)

    def _combine_folds_best_worst_songs(self, orig_dict):
        combined_dict = dict()
        for (fold_no, tag_dict) in orig_dict.iteritems():
            for (tag, info_dict) in tag_dict.iteritems():
                combined_dict.setdefault(tag, {})
                for (key, val) in info_dict.iteritems():
                    combined_dict[tag].setdefault(key, []).append(val)
        return combined_dict

    def _read_cv_folds(self):
        """
        Returns a dict: fold # -> set of songs for that fold.
        """
        self._progress("Reading CV folds.")
        cv_folds = dict()
        N_FOLDS = 5
        for i_fold in range(N_FOLDS):
            cur_file = open("%s/lists/crossFold/part%i.tab"
                            % (self.basedir, i_fold + 1), "r")
            cv_folds[i_fold] = self._cv_get_songs(cur_file)
            cur_file.close()
        return cv_folds

    def _cv_get_songs(self, file):
        """
        Read the song ids, one per line.
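
        Each line is assumed to begin with an integer song id; any further
        tab-separated fields on the line are ignored.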
""" songs = set() for line in file: line_list = line.rstrip().split("\t") songs.add(int(line_list[0])) return songs def _get_ttests_dict(self, per_tagtype_results): per_tagtype = dict() for (tagtype, per_param_results) in per_tagtype_results.iteritems(): per_tagtype.setdefault(tagtype, {}) for (val1, val1_results) in per_param_results.items(): for (val2, val2_results) in per_param_results.items(): cur_pair = str(sorted([val1, val2])) if val1 != val2 and cur_pair not in per_tagtype[tagtype]: per_tagtype[tagtype][cur_pair] = self._individual_ttest_dict(val1_results["per_tag_avg"], val2_results["per_tag_avg"]) return per_tagtype def _individual_ttest_dict(self, per_tag_dict1, per_tag_dict2): STATS_TO_COMPARE = ["AUC", "MAP", "R-Prec", "10-Prec"] cur_dict = dict() for stat in STATS_TO_COMPARE: differences = [] for (tag, dict1_stats) in per_tag_dict1.iteritems(): try: dict2_stats = per_tag_dict2[tag] except KeyError: continue try: val1 = dict1_stats[stat] val2 = dict2_stats[stat] differences.append(val1-val2) except KeyError: continue if len(differences) > 0: ttest_dict = rc.t_test(differences) del ttest_dict["data.name"] # That's a long and ugly value. cur_dict[stat] = ttest_dict return cur_dict def main(): cv = Crossvalidate() cv.generate_readable_tab_files() #cv.try_all_regression_models() #cv.try_all_regression_types() if __name__ == "__main__": main()