# Combiner class

from __future__ import division
import util
import numpy
import rpy
import random
from rpy import r as rc
from math import log, sqrt
import pdb
import sys

__author__ = "Brian Tomasik"
__date__ = "April/May 2009"

# Set numpy printing options; see http://www.scipy.org/Numpy_Example_List_With_Doc#head-cc1302f5e9e57de71b578cf25e8a9ffd8aa3a707
numpy.set_printoptions(precision=2)
numpy.set_printoptions(suppress=True)

ALL_REGRESSIONS = ["Independent Linear", "Hierarchical Linear", "Hierarchical Mixture", "Independent Logistic","Hierarchical Logistic","Product","Sum","Min","Max","Median"]#, "Random Guess"] <-- Random Guess still works, but not needed.
ALL_TAGTYPES = ["Pandora Genres", "Pandora Acoustic", "All", "Last.fm"]
ALL_SOURCES = ["CB", "CF", "WD", "P", "I", "Web Binary", "Web Page Percent", "CF2", "CF3", "CF4", "CF5", "CF6", "Random"]

class Combiner(object):
    """
    A Combiner reads input .tab files relative to the directory "basedir"
    and uses regression to combine them.
    """
    def __init__(self, only_these_tags=None, regtype="Independent Linear", tagtype="All", regmodel="CB and WD and CF and P", verbosity=1, production_run=True, basedir = "../../..", fold_no=None, min_tag_count=1, min_feature_count=0, overwrite_final_tab_file=True, mcmc_reps=15000, max_n_songs="INF", ncomp=2, add_intercept_as_first_feature=True, force_betas_nonneg_except_scrobble_or_if_interactions=True):
        self.regtype = regtype
        self.regmodel = regmodel
        self.tagtype = tagtype
        self._verify_regmodel_written_correctly()
        self.verbosity = verbosity
        self.overwrite_final_tab_file = overwrite_final_tab_file
        self.mcmc_reps = mcmc_reps
        self.max_n_songs = max_n_songs
        self.ncomp = ncomp # only used for "Hierarchical Mixture" regtype
        self.production_run = production_run # If True, disable all debugging shortcuts.
        self.basedir = basedir
        self.fold_no = fold_no
        self.min_tag_count = min_tag_count
        self.min_feature_count = min_feature_count
        self.add_intercept_as_first_feature = add_intercept_as_first_feature
        self.force_betas_nonneg_except_scrobble_or_if_interactions = force_betas_nonneg_except_scrobble_or_if_interactions if "I" not in regmodel else False
        self._read_tag_set(self.tagtype) # get some values for self.only_these_tags
        if only_these_tags is not None:
            self.only_these_tags = only_these_tags.intersection(self.only_these_tags)
        self.beta = dict() # dict: tag -> (dict: source -> (dict: regression statistic -> value)); in particular, the "regression statistics" at least include beta, but possibly also p-values, etc.
        self.stats = dict() # dict: tag -> (dict: stat_type -> value)
        self.best_worst_songs = dict() # dict: tag -> (dict: good/bad -> songid)
        self.songid_to_song = self._get_songid_to_song()
        self._set_or_reset_data_dictionaries()

    def _read_tag_set(self, tagtype):
        self._progress("Reading tags of type %s." % tagtype)
        self.only_these_tags = set()
        if tagtype=="All":
            self._read_pandora_results(just_read_tags=True)
            self._read_lastfm_results(just_read_tags=True)
        elif tagtype=="Last.fm":
            self._read_lastfm_results(just_read_tags=True)
        elif tagtype=="Pandora Genres":
            self._read_pandora_results(just_read_tags=True)
            self.only_these_tags = self.only_these_tags.intersection(self._get_pandora_tag_set("%s/lists/Pandora/PandoraGenreTag.tab" % self.basedir))
        elif tagtype=="Pandora Acoustic":
            self._read_pandora_results(just_read_tags=True)
            self.only_these_tags = self.only_these_tags.intersection(self._get_pandora_tag_set("%s/lists/Pandora/PandoraAcousticTag.tab" % self.basedir))
        else:
            raise ValueError("Bad tagtype = %s." % tagtype)

    def _verify_regmodel_written_correctly(self):
        for source in self.regmodel.split(" and "):
            assert source in ALL_SOURCES, "Invalid regmodel: %s" % self.regmodel

    def _get_songid_to_song(self):
        file = open("%s/lists/songList.tab" % self.basedir)
        songid_dict = dict()
        for line in file:
            (songid_string, artist, song_name, mp3file) = line.rstrip().split("\t")
            songid_dict[int(songid_string)] = "%s, by %s" % (song_name, artist)
        file.close()
        return songid_dict

    def _set_or_reset_data_dictionaries(self):
        """
        Write over old values so that the testing phase can start anew.
        """
        self.only_these_songs = None
        self.all_seen_songs = set()
        self.features = dict()  # dict: tag -> (dict: source -> (dict: songid -> score))
        self.ground_truth = dict()  # dict: tag -> dict of songid -> score
        self.song_lists = dict() # dict: tag -> set of songs used for this tag
        self.sorted_sources = dict() # tag -> ordered list of sources for this X and beta
        self.X = dict() # dict: tag -> X matrix
        self.y = dict() # dict: tag -> X matrix
        self.yhat = dict() # dict: tag -> X matrix

    def _remove_tag(self, tag, verbosity=1):
        if verbosity > 0:
            util.info("\tWARNING: Removing tag = %s." % tag)
        self.only_these_tags.remove(tag)
        if tag in self.features:
            del self.features[tag]
        if tag in self.ground_truth:
            del self.ground_truth[tag]
        if tag in self.song_lists:
            del self.song_lists[tag]
        if tag in self.beta:
            del self.beta[tag]
        if tag in self.stats:
            del self.stats[tag]
        if tag in self.sorted_sources:
            del self.sorted_sources[tag]
        if tag in self.X:
            del self.X[tag]
        if tag in self.y:
            del self.y[tag]
        if tag in self.yhat:
            del self.yhat[tag]

    def nonrare_tags(self, only_these_songs):
        """
        Return the tags that weren't pruned upon reading the input data.
        
        Required params:
        - only_these_songs: A set of the only songs we should consider
        from the input.
        """
        self._progress("Computing nonrare tags.")
        self._read_data(only_these_songs=only_these_songs)
        return self.only_these_tags # Will have been pruned by getting X and y.

    def evaluate_regression(self, training_songs=None, testing_songs=None):
        """
        Get results for a single training-and-testing fold of crossvalidation.
        
        Optional params:
        - training_songs: The set of songs to which to restrict ourselves in
        training. If None, then use no restrictions.
        - testing_songs: The same for testing.
        """
        assert len(training_songs.intersection(testing_songs))==0, "Can't test on training songs!"
        # Training.
        self._read_data(only_these_songs=training_songs) # Will update self.all_seen_songs.
        self._make_X_and_y_matrices()
        self._compute_betas()
        # Testing.
        self._set_or_reset_data_dictionaries()
        self._read_data(only_these_songs=testing_songs)
        self._make_X_and_y_matrices()
        self._compute_yhat()
        self._performance_stats()
        if self.verbosity > 2:
            self._progress(self.stats)
            self._progress(str(self.beta))
        return self.stats

    def fill_in_zeros(self, outfile="final.tab"):
        """
        Generate a final song-score matrix as "outfile" for use by a query
        evaluator.
        
        Optional params:
        - outfile: The file to which to write the final matrix.
        """
        self._read_data(only_tags_have_all_sources=False)
        self._make_X_and_y_matrices()
        self._compute_betas()
        self._compute_yhat()
        self._write_final(outfile=outfile)
        print self.beta

    def _progress(self, message, newline=True):
        """
        Write info if high enough verbosity.
        """
        if self.verbosity > 0:
            util.info(message, newline=newline)

    def _read_data(self, only_these_songs=None, only_tags_have_all_sources=True, seed_random_subset=True):
        """
        """
        self.only_these_songs = only_these_songs
        self._read_features()
        self._read_ground_truth()
        self._remove_uncommon_tags()
        if only_these_songs is not None:
            assert util.is_subset(self.all_seen_songs, only_these_songs), "Input files had songs that shouldn't have been used."
        if self.max_n_songs != "INF":
            if seed_random_subset:
                random.seed("This is just an arbitrary hash with which to seed random. I'm doing this so that we can compare different types of regressions on equal footing, without worrying about differences spuriously introduced by the particular subset that was chosen.")
            self.all_seen_songs = util.random_subset(self.all_seen_songs, self.max_n_songs)
        if only_tags_have_all_sources:
            self._remove_tags_without_all_sources()

    def _read_features(self):
        self._progress("Reading features.")
        if "CF" in self.regmodel or self.regmodel=="Random": # If self.regmodel=="Random", we need to choose some set of tags to use, so just use the same set as CF....
            self._add_source("%s/lists/propogate/propagated_real1.tab" % self.basedir, "CF") # Use this one!
        if "CF2" in self.regmodel:
            self._add_source("%s/lists/propogate/propagated_real2.tab" % self.basedir, "CF2")
        if "CF3" in self.regmodel:
            self._add_source("%s/lists/propogate/propagated_real3.tab" % self.basedir, "CF3")
        if "CF4" in self.regmodel:
            self._add_source("%s/lists/propogate/propagated_binary1.tab" % self.basedir, "CF4")
        if "CF5" in self.regmodel:
            self._add_source("%s/lists/propogate/propagated_binary2.tab" % self.basedir, "CF5")
        if "CF6" in self.regmodel:
            self._add_source("%s/lists/propogate/propagated_binary3.tab" % self.basedir, "CF6")
        if "CB" in self.regmodel:
            if self.fold_no is None:
                self._add_source("%s/lists/autoTags/crossFold/six_five_sec/normalized_all_TSS.tab" % self.basedir, "CB")
            else:
                part_num = self.fold_no+1+2 # Add 1 because the fold numbers are 1-indexed, not 0-indexed. Add another 2 for some unknown reason. Fold 0 corresponds to part3, fold 1 to part4, etc.
                if part_num > 5:
                    part_num -= 5 # change 6 to 1, 7 to 2
                self._add_source("%s/lists/autoTags/crossFold/six_five_sec/normalized_part%d_TSS.tab" % (self.basedir, part_num), "CB")
        if "Web Binary" in self.regmodel:
            self._add_source("%s/lists/web/greppy_binary.tab" % self.basedir, "Web Binary")
        # Take sqrts because count data. Makes slightly more normal. See
        # http://cran.r-project.org/doc/contrib/Faraway-PRA.pdf (p. 84).
        if "Web Page Percent" in self.regmodel:
            self._add_source("%s/lists/web/greppy_pagepct.tab" % self.basedir, "Web Page Percent", transform=lambda x: sqrt(x))
        if "WD" in self.regmodel:
            self._add_source("%s/lists/web/greppy_rawcount.tab" % self.basedir, "WD", transform=lambda x: sqrt(x))
        self._read_scrobble("%s/lists/lastFM/song_scrobbler_data.tab" % self.basedir)
        self._compute_sorted_sources()

    def _add_source(self, filename, sourcename, transform=None, only_a_few_lines=True):
        """
        Update the features dict with data from the file named filename.
        Use the same name for the type of data source.
        """
        self._progress("Adding source = %s" % sourcename)
        file = open(filename, "r")
        line_no = 0
        for line in file:
            line_no += 1
            if line_no % 500 == 0:
                self._progress("cur line = %d" % line_no)
            if only_a_few_lines and not self.production_run and line_no > 200:
                util.info("\tWARNING: stopping at line 200 of input file. Turn off for production runs.")
                break
            cur_tag = self._read_tag(line)
            if cur_tag in self.only_these_tags:
                cur_dict = self._line_to_dict(line.rstrip().split("\t"), transform=transform)
                if cur_dict: # that is, if cur_dict is not empty
                    try:
                        source_dict = self.features[cur_tag]
                    except KeyError:
                        source_dict = dict()
                    try:
                        old_dict = source_dict[sourcename]
                        # If we get here, we need to merge the new
                        # cur_dict with the old one.
                        source_dict[sourcename] = self._merge_song_dicts(old_dict, cur_dict)
                    except KeyError: # We're adding a new source.
                        source_dict[sourcename] = cur_dict
                    self.features[cur_tag] = source_dict
        file.close()

    def _merge_song_dicts(self, dict1, dict2):
        """
        Destructively merge dict1 and dict2, taking the union of song-score
        values and, where songs match, taking the max of the scores.
        """
        # Add the entries of the shorter dict to the longer one. main is
        # longer than other.
        if len(dict1) >= len(dict2):
            main = dict1
            other = dict2
        else:
            main = dict2
            other = dict1
        for (other_key, other_val) in other.iteritems():
            try:
                main_val = main[other_key]
                main[other_key] = max(main_val, other_val)
            except KeyError:
                main[other_key] = other_val
        return main

    def _read_scrobble(self, filename):
        self.log_scrobble_counts = dict()
        self._progress("Reading Scrobble counts.")
        file = open(filename, "r")
        for line in file:
            (songid, score) = map(lambda str: int(str), line.rstrip().split("\t"))
            if self.only_these_songs is None or songid in self.only_these_songs:
                # Add 1+score to prevent log(0).
                self.log_scrobble_counts[songid] = log(score+1)
        file.close()
        self.avg_scrobble = numpy.mean(self.log_scrobble_counts.values())

    def _read_tag(self, line):
        """
        (Force all tags to lower case.)
        """
        (cur_tag, junk, junk2) = line.rstrip().partition("\t")
        return cur_tag.lower()

    def _line_to_dict(self, line_list, transform=None):
        """
        Parse [tag,song,score,song,score,...] to a dictionary.
        If not None, transform is a function that takes input float scalars.
        """
        # tag = line_list[0]
        d = dict()
        for i in range(1,len(line_list)):
            if i % 2 == 1:
                cur_songid = int(line_list[i])
                if self.only_these_songs is None or cur_songid in self.only_these_songs:
                    if transform is None:
                        d[cur_songid] = float(line_list[i+1])
                    else:
                        d[cur_songid] = transform(float(line_list[i+1]))
                    self.all_seen_songs.add(cur_songid)
        return d   

    def _get_pandora_tag_set(self, filename):
        file = open(filename, "r")
        tag_set = set()
        for line in file:
            tag_set.add(line.rstrip().lower()) # Make lower-case!
        file.close()
        return tag_set

    def _read_ground_truth(self):
        self._progress("Reading ground truth.")
        self._read_lastfm_results()
        self._read_pandora_results()
        #self._prune_low_count_tags() See my comment on the function itself for why it's commented out.

    def _read_lastfm_results(self, just_read_tags=False, song_weight=0.66666):
        assert song_weight >= 0 and song_weight <= 1, "Bad song weight."

        # Song results.
        last_fm_song_file = open("%s/lists/lastFM/song_results.tab" % self.basedir, "r")
        for line in last_fm_song_file:
            song_tag = self._read_tag(line)
            if just_read_tags:
                self.only_these_tags.add(song_tag)
            elif song_tag in self.only_these_tags:
                song_dict = self._line_to_dict(line.rstrip().split("\t"))
                assert song_tag not in self.ground_truth, "Duplicate tag in input file"
                self.ground_truth[song_tag] = song_dict
        last_fm_song_file.close()

        # Artist results.
        last_fm_artist_file = open("%s/lists/lastFM/song_artist_results.tab" % self.basedir, "r")
        for line in last_fm_artist_file:
            artist_tag = self._read_tag(line)
            if just_read_tags:
                self.only_these_tags.add(artist_tag)
            elif artist_tag in self.only_these_tags:
                artist_dict = self._line_to_dict(line.rstrip().split("\t"))
                try:
                    new_dict = self.ground_truth[artist_tag]
                    for (songid, artist_score) in artist_dict.items():
                        try:
                            song_score = new_dict[songid]
                            new_dict[songid] = song_weight * song_score + (1-song_weight) * artist_score
                        except KeyError: # song dict doesn't have this songid
                            new_dict[songid] = artist_score
                    self.ground_truth[artist_tag] = new_dict
                except KeyError: # tag doesn't have song info; just use artist info
                    self.ground_truth[artist_tag] = artist_dict
        last_fm_artist_file.close()

    def _read_pandora_results(self, just_read_tags=False):
        pandora_file = open("%s/lists/Pandora/PandoraTagSong.tab" % self.basedir, "r")
        for line in pandora_file:
            pandora_tag = self._read_tag(line)
            if just_read_tags:
                self.only_these_tags.add(pandora_tag)
            elif pandora_tag in self.only_these_tags:
                pandora_dict = self._line_to_dict(line.rstrip().split("\t"))
                try:
                    new_dict = self.ground_truth[pandora_tag]
                    for (songid, pandora_score) in pandora_dict.items():
                        try:
                            lastfm_score = new_dict[songid]
                            new_dict[songid] = max(lastfm_score, pandora_score)
                        except KeyError: # song dict doesn't have this songid
                            new_dict[songid] = pandora_score
                    self.ground_truth[pandora_tag] = new_dict
                except KeyError: # tag doesn't have lastfm info; just use pandora info
                    self.ground_truth[pandora_tag] = pandora_dict
        pandora_file.close()

    """
    I think this is unnecessary, given that I prune when making the X and y matrices....
    def _prune_low_count_tags(self):
        for (tag, song_dict) in self.ground_truth.items():
            if len(song_dict) < self.min_tag_count:
                self._remove_tag(tag, verbosity=0)
    """

    def _remove_uncommon_tags(self):
        feature_tags = set(self.features.keys())
        ground_truth_tags = set(self.ground_truth.keys())
        remove_these = feature_tags.symmetric_difference(ground_truth_tags)
        for tag in remove_these:
            self._remove_tag(tag, verbosity=0)
        if self.verbosity > 1:
            util.info("\tWARNING: Removing these tags not in intersection of features and ground truth: %s." % str(remove_these))
             
    def _compute_sorted_sources(self):
        """
        Preliminary experiments showed that intercept helps a lot, increasing 10precision from 0.05 to 0.06
        """
        self.sorted_sources = dict()
        for (tag, source_dict) in self.features.items():
            source_list = []
            if self.add_intercept_as_first_feature:
                source_list.append("intercept")
            # Need to include main effect if we also have scrobble interaction:
            # http://orgtheory.wordpress.com/2007/10/20/interaction-models/
            if "P" in self.regmodel:
                source_list.append("scrobble")
            # Find the other sources.
            tag_has_nontrivial_sources = False
            for (source, song_dict) in sorted(source_dict.items()): # sort so that features are always in same order
                if len(song_dict) >= self.min_feature_count:
                    source_list.append(source)
                    if "I" in self.regmodel:
                        source_list.append("%s_interaction" % source)
                    tag_has_nontrivial_sources = True
            if tag_has_nontrivial_sources:
                self.sorted_sources[tag] = source_list
            else:
                self._remove_tag(tag, verbosity=0)
    
    def _remove_tags_without_all_sources(self):
        try:
            n_sources = max([len(source_list) for source_list in self.sorted_sources.values()])
        except:
            pdb.set_trace()
        for (tag, source_list) in self.sorted_sources.items():
            if len(source_list) < n_sources:
                self._remove_tag(tag, verbosity=0)

    def _make_X_and_y_matrices(self):
        tag_num = 0
        n_tags = len(self.features)
        for (tag, sources_dict) in self.features.items():
            tag_num += 1
            self._progress("Reading tag %d of %d: %s" % (tag_num, n_tags, tag), newline=True) # rmme: make newline false
            self.song_lists[tag] = self._compute_song_list(sources_dict)
            if len(self.song_lists[tag]) > 0 and tag in self.ground_truth:
                self._cur_tag_X_and_y(tag)
            else:
                self._remove_tag(tag)
                assert False, """We shouldn't get here, because we should already have removed tags not shared by features and ground truth."""


    def _compute_song_list(self, sources_dict, use_entire_song_list=True):
        if use_entire_song_list:
            song_set = self.all_seen_songs
        else:
            song_set = set()
            for (source_name, source_d) in sources_dict.iteritems():
                song_set.update(set(source_d.keys())) # union of songs for all sources; could also do intersection
            song_set = song_set.intersection(self.all_seen_songs)
        song_list = list(song_set)
        random.seed("Another random seed, so that shuffling will be fixed though arbitrary.")
        random.shuffle(song_list)
        return song_list

    def _cur_tag_X_and_y(self, tag, test_random=False, all_ground_truth_binary=True):
        """
        """
        sorted_sources = self.sorted_sources[tag]
        cur_song_list = self.song_lists[tag]
        features_dict = self.features[tag]
        y = rc.matrix(self._dict_to_vec(self.ground_truth[tag], self.song_lists[tag]))
        if util.num_nonzeros(y) < self.min_tag_count:
            self._remove_tag(tag, verbosity=0)
            return
        # Possibly add intercept.
        ncol = 0
        x_vec = []
        if "intercept" in sorted_sources:
            ncol += 1
            x_vec.extend([1.0 for songid in cur_song_list])
        # Add Scrobble counts
        scrobble_vec = numpy.array([self.log_scrobble_counts.get(songid, self.avg_scrobble) for songid in cur_song_list])
        if "scrobble" in sorted_sources:
            ncol += 1
            x_vec.extend(self._standardize(scrobble_vec))
        # Add remaining features.
        for source in sorted_sources:
            if source.endswith("_interaction"):
                main_source = util.remove_trailing_string(source, "_interaction")
                main_vec = self._dict_to_vec(features_dict.get(main_source, None), cur_song_list)
                feature_vec = numpy.multiply(main_vec, scrobble_vec) # pointwise product
            elif source not in ["intercept", "scrobble"]:
                feature_vec = self._dict_to_vec(features_dict.get(source, None), cur_song_list)
            else:
                continue
            x_vec.extend(self._standardize(feature_vec))
            ncol += 1
        try:
            X = rc.matrix(x_vec,ncol=ncol)#dimnames=[[],x_sources])
        except:
            pdb.set_trace()
            # Temporary fix: rmme!
            self._remove_tag(tag)
            return
        if test_random and not self.production_run: # Erase all of the above and do some random numbers for testing.
            n_songs = len(X)
            X = numpy.random.standard_normal((n_songs,ncol))
            if ncol==1:
                y = 3 * X[:,0] + 0.5*numpy.random.standard_normal((1,n_songs))
            else:
                y = 3 * X[:,0] + X[:,1] + 0.5*numpy.random.standard_normal((1,n_songs))
            y = y.transpose()
        if all_ground_truth_binary or self.regtype=="Independent Logistic" or self.regtype=="Hierarchical Logistic":
            # Convert y to 0's and 1's.
            y = 1.0*(numpy.array(y)>0) # multiply by 1.0 to make Float
        self.X[tag] = X
        self.y[tag] = y

    def _standardize(self, vector):
        """
        Makes betas comparable.
        """
        mean = numpy.mean(vector)
        std = numpy.std(vector)
        if std==0: # This happens, e.g., when vector is all 1's, or just a single number.
            return [1.0 for val in vector] # We have to pick something....
        else:
            return map(lambda x: (x-mean)/std, vector)

    def _dict_to_vec(self, d, song_list):
        """
        """
        if d is None:
            # This acts as though we have information we don't. Another approach
            # is to leave these out of the model entirely.
            return numpy.array([0.0 for val in song_list])
        else:
            return numpy.array([d.get(val, 0.0) for val in song_list])

    def _compute_betas(self, remove_tags_when_bad_regression=True):
        subsets_of_sources = self._compute_subsets_of_sources()
        for (source_list, tag_list) in subsets_of_sources.iteritems():
            self._progress("Computing betas for source subset = %s." % source_list, newline=False)
            if self.regmodel=="Random":
                self._random_betas(tag_list)
            else:
                if "Hierarchical" in self.regtype:
                    self._mcmc_betas_same_sources(tag_list)
                elif "Independent" in self.regtype:
                    self._independent_betas_same_sources(tag_list, remove_tags_when_bad_regression)
                elif self.regtype=="Random Guess":
                    self._random_betas(tag_list)
                elif self.regtype in ["Product","Sum","Min","Max","Median"]:
                    self._NA_betas(tag_list)
                else:
                    raise ValueError("Bad regtype = %s." % self.regtype)
        if self.force_betas_nonneg_except_scrobble_or_if_interactions:
            for (tag, source_dict) in self.beta.items():
                for (source, stat_dict) in source_dict.items():
                    if source != "scrobble" and stat_dict["beta"] < 0:
                        # Remove the p-values and other stats; just make beta 0.
                        self.beta[tag][source] = {"beta":0}
                        # TODO: Replace with nonnegative regression.

    def _compute_subsets_of_sources(self):
        subsets = dict()
        for (tag, sorted_source_list) in self.sorted_sources.iteritems():
            subsets.setdefault(str(sorted_source_list), []).append(tag)
        return subsets

    def _mcmc_betas_same_sources(self, tag_list):
        """
        The given tag_list contains tags that all have the same features
        available. Train on the tags in tag_list using only the songs
        in self.only_these_songs, or all available songs if
        self.only_these_songs is None.
        """
        if not self.production_run:
            self.mcmc_reps = 75 # save time
        rc.library("bayesm")
        data = []
        for tag in tag_list:
            data.append(rc.list(X=self.X[tag],y=self.y[tag]))
        rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
        data = rc.list(*data)
        if self.regtype in ["Hierarchical Linear", "Hierarchical Mixture"]:
            Data = rc.list(regdata=data)
        elif self.regtype=="Hierarchical Logistic":
            Data = rc.list(lgtdata=data)
        if self.regtype=="Hierarchical Mixture":
            Prior = rc.list(ncomp=self.ncomp)
        Mcmc=rc.list(R=self.mcmc_reps)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        try:
            if self.regtype=="Hierarchical Linear":
                output = rc.rhierLinearModel(Data=Data,Mcmc=Mcmc)
            elif self.regtype=="Hierarchical Logistic":
                output = rc.rhierBinLogit(Data=Data,Mcmc=Mcmc)
            elif self.regtype=="Hierarchical Mixture":
                output = rc.rhierLinearMixture(Data=Data,Prior=Prior,Mcmc=Mcmc)
        except:
            #pdb.set_trace()
            self._info_about_r_error(tag_list)
            return
        beta_matrix = output['betadraw'].mean(axis=2) # nregressions x ncoeffs, averaged along third dim
        matrix_index = 0
        for tag in tag_list:
            cur_tag_beta_vec = beta_matrix[matrix_index,:]
            beta_dict_list = [dict([('beta', coeff)]) for coeff in cur_tag_beta_vec]
            self.beta[tag] = dict(zip(self.sorted_sources[tag],beta_dict_list))
            self.stats[tag] = dict() # I'm not currently storing any stats for hierarchical regressions.
            matrix_index += 1

    def _info_about_r_error(self, tag_list):
        """
        Print info when rhier... functions raise a sorting-related error.
        Delete all tags we're working with.
        """
        util.info("\tERROR: Problem with R's sorting thingymajig. %s" % str(sys.exc_info()))
        for tag in tag_list:
            self._remove_tag(tag)

    def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
        times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
        SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
        for tag in tag_list:
            self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
            rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
            data = rc.list(y=self.y[tag],X=self.X[tag])
            model = "y~X-1" # Use -1 because X has an intercept already
            if self.regtype=="Independent Linear":
                try:
                    result = rc.lm(model,data=data)
                except:
                    pdb.set_trace()
            elif self.regtype=="Independent Logistic":
                result = rc.glm(model,family=rc.binomial("logit"),data=data)
            rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode.
            summary = rc.summary(result,correlation=rc.TRUE)
            self._record_regression_stats(tag, summary)
            beta_dict = dict()
            sorted_sources = self.sorted_sources[tag]
            coeff_matrix = summary["coefficients"]
            for i in range(len(sorted_sources)):
                try:
                    cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
                except IndexError:
                    util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                    if remove_tags_when_bad_regression:
                        self._remove_tag(tag)
                        break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features....
                    continue
                try:
                    cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
                except OverflowError:
                    pass
                beta_dict[sorted_sources[i]] = cur_source_dict
            if tag not in self.features: # We've removed this tag a few lines above, so skip it.
                continue
            self.beta[tag] = beta_dict
            if times_showed_summary < n_times_show_summary:
                self._print_regression_summary(tag, summary)
                times_showed_summary += 1
            #predictions = rc.predict(result)
            #predictions = [predictions['%d' % i] for i in range(1,len(predictions)+1)]

    def _random_betas(self, tag_list):
        """
        Betas should be just 0.
        """
        for tag in tag_list:
            beta_dict = dict()
            for source in self.sorted_sources[tag]:
                beta_dict[source] = dict([("beta",0)])
            self.beta[tag] = beta_dict
            self.stats[tag] = dict()

    def _NA_betas(self, tag_list):
        """
        Just fill in the beta_dict so that we can use it to iterate through tags later.
        """
        for tag in tag_list:
            beta_dict = dict()
            for source in self.sorted_sources[tag]:
                beta_dict[source] = dict([("beta","NA because regtype is %s." % self.regtype)])
            self.beta[tag] = beta_dict
            self.stats[tag] = dict()

    def _record_regression_stats(self, tag, summary):
        self.stats[tag] = dict()
        if self.regtype=="Independent Linear":
            self.stats[tag][r"$R^2$"] = summary['r.squared']
            self.stats[tag][r"$R^2_\text{adj}$"] = summary["adj.r.squared"]
            self.stats[tag][r"$\widehat{\sigma}$"] = summary["sigma"]
            try:
                self.stats[tag][r"$-log_10(\text{p-val})"] = -log(1-rc.pf(summary['fstatistic']['value'],summary['fstatistic']['numdf'],summary['fstatistic']['dendf']), 10)  # For why this formula works, see, e.g., http://cran.r-project.org/doc/contrib/Faraway-PRA.pdf (page 30)
            except OverflowError:
                pass # p-value is really really small!
        elif self.regtype=="Independent Logistic":
            self.stats[tag]["Deviance"] = summary["deviance"]  # The deviance is twice the log-likelihood ratio statistic
            self.stats[tag]["Null deviance"] = summary["null.deviance"]
            self.stats[tag]["AIC"] = summary["aic"]

    def _print_regression_summary(self, tag, summary):
        self._progress("Cur tag = %s." % tag)
        if self.regtype=="Independent Linear":
            self._progress("R^2 = %.3f, R^2-adjusted = %.3f, estim err sigma = %.2f." % (summary['r.squared'], summary["adj.r.squared"], summary["sigma"]))
        self._progress("Feature correlation matrix:\n%s" % str(numpy.corrcoef(numpy.transpose(self.X[tag]))))
        self._progress("Beta correlation matrix:\n%s" % str(summary['correlation']))

    def _compute_yhat(self):
        self._progress("Computing yhat.")
        for (tag, X_test) in self.X.iteritems():
            X_test_sources = self.sorted_sources[tag]
            n_songs = len(X_test)
            yhat = numpy.zeros((1,n_songs))
            if self.regtype in ["Product","Sum","Min","Max","Median"]:
                if self.add_intercept_as_first_feature:
                    assert X_test[0][0]==1, "Failed quick-and-dirty test for intercept."
                    X_test = X_test[:,1:] # eliminate intercept
                feature_probs = 1/(1+numpy.exp(-X_test)) # Convert positive and negative scores to "probabilities".
                for i in range(numpy.size(yhat,axis=1)):
                    yhat[0,i] = self._combine_probs(feature_probs[i,:])
            else:
                for (source, stats_dict) in self.beta[tag].items():
                    try:
                        index_in_X_test = X_test_sources.index(source)
                        yhat += stats_dict['beta'] * X_test[:,index_in_X_test]
                    except ValueError:
                        continue
                if self.regtype=="Independent Logistic" or self.regtype=="Hierarchical Logistic":
                    yhat = 1/(1+numpy.exp(-yhat))
            self.yhat[tag] = yhat

    def _combine_probs(self, feature_probs):
        """
        feature_probs is an array of "probabilities" to be combined.
        """
        if self.regtype=="Product":
            return numpy.exp(numpy.sum(numpy.log(feature_probs)))
        elif self.regtype=="Sum":
            return numpy.sum(feature_probs)
        elif self.regtype=="Max":
            return max(feature_probs)
        elif self.regtype=="Min":
            return min(feature_probs)
        elif self.regtype=="Median":
            return numpy.median(feature_probs)
        else:
            raise ValueError("You shouldn't be calling this function with regtype=%s." % self.regtype)

    def _performance_stats(self, remove_tags_dont_work=True):
        self._progress("Computing performance stats.")
        for (tag, true_y) in self.y.items():
            if tag not in self.stats:
                pdb.set_trace()
            yhat = self.yhat[tag].transpose() # make n_songs x 1
            self.stats[tag]["Num Songs"] = len(yhat)
            # SSE
            self.stats[tag][r'$\text{SSE} / n$'] = numpy.sum(numpy.power((true_y-yhat),2)) / len(true_y)
            # precision, recall, etc.
            sorted_yhat = sorted([(yhat[i,0], i) for i in range(len(yhat))], reverse=True)
            graded = [self._in_ground_truth(true_y[i,0]) for (yhat_val, i) in sorted_yhat]
            try:
                self.stats[tag]["Baseline"] = self._random_precision(graded)
                self.stats[tag]["AUC"] = self._areaUnderCurve(graded)
                self.stats[tag]["MAP"] = self._avgPrecision(graded)
                self.stats[tag]["R-Prec"] = self._rPrecision(graded)
                self.stats[tag]["10-Prec"] = self._tenPrecision(graded)
                baseline = self.stats[tag]["Baseline"]
                if baseline > 0:
                    self.stats[tag]["MAP/Baseline"] = self.stats[tag]["MAP"] / baseline
                    self.stats[tag]["R-Prec/Baseline"] = self.stats[tag]["R-Prec"] / baseline
                    self.stats[tag]["10-Prec/Baseline"] = self.stats[tag]["10-Prec"] / baseline
            except ValueError:
                util.info("WARNING: TP==0 or FP==0 for tag = %s." % tag)
                if remove_tags_dont_work:
                    self._remove_tag(tag)
                    continue
            # Record best and worst songs.
            song_list = list(self.song_lists[tag])
            self.best_worst_songs[tag] = dict()
            index_best_song = sorted_yhat[0][1]
            self.best_worst_songs[tag]["Best Song"] = (self.songid_to_song[song_list[index_best_song]], 1 if true_y[index_best_song,0] else 0)
            index_worst_song = sorted_yhat[-1][1]
            self.best_worst_songs[tag]["Worst Song"] = (self.songid_to_song[song_list[index_worst_song]], 1 if true_y[index_worst_song,0] else 0)

    def _in_ground_truth(self, y_val):
        return y_val > 0

    def _point_precision(self, retrieved_docs):
	"""
	Return the precision when we retrieve the documents whose correctness is
	given by a True or False value in the list retrieved_docs.
	"""
	# Sum of True is 1, False is 0.
        try:
            return sum(retrieved_docs) / len(retrieved_docs)
        except ZeroDivisionError:
            raise ValueError("No retrieved docs.")

    def _random_precision(self, graded):
        self._progress("Computing baseline precision.", newline=False)
        try:
            return sum(graded) / len(graded)
        except ZeroDivisionError:
            raise ValueError("No songs for this tag.")        
	    
    def _avgPrecision(self, graded):
	"""
	Given the list, graded, of True/False values in the order in which we
	ranked the documents, return our average precision.
	"""
        self._progress("Computing avg precision.", newline=False)
        TP = 0
        P = 0 # total num called positive
        running_sum_for_avg_precision = 0
        for item in graded:
            P += 1
            if item:
                TP += 1
                running_sum_for_avg_precision += TP / P
        return running_sum_for_avg_precision / TP
            
    def _rPrecision(self, graded):
	"""
	Given the list, graded, of True/False values in the order in which we
	ranked the documents, return R-precision.
	"""	
        self._progress("Computing R-precision.", newline=False)        
	R = sum(graded) # Num relevant.
	return self._point_precision(graded[:R])
    
    def _tenPrecision(self, graded):
	"""
	Given the list, graded, of True/False values in the order in which we
	ranked the documents, return the 10-precision.
	"""
        self._progress("Computing 10-precision.", newline=False)        
	n_return = min(len(graded), 10)
	return self._point_precision(graded[:n_return])
    
    def _areaUnderCurve(self, graded):
	"""
	Given the list, graded, of True/False values in the order in which we
	ranked the documents, return the AUC.
	"""
        self._progress("Computing AUC.", newline=False)
	sum = 0
	TP = 0
	FP = 0
	for boolVal in graded:
	    if boolVal:
		TP += 1
	    else:
		FP += 1
		sum += TP
	if TP == 0 or FP == 0:
	    raise ValueError("AUC undefined.")
        return sum / (TP * FP)

    def _write_final(self, outfile):
        self._progress("Writing interpolated results to %s." % outfile)
        if self.overwrite_final_tab_file:
            file = open(outfile, "w")
        else:
            file = open(outfile, "a")
        for (tag, true_y) in self.y.iteritems():
            song_score_pairs = []
            interpolated = numpy.maximum(true_y, self.yhat[tag].transpose())
            song_list = list(self.song_lists[tag])
            for i in range(len(interpolated)):
                song_score_pairs.append((song_list[i], interpolated[i,0]))
            file.write(tag)
            for (song, score) in sorted(song_score_pairs, key=lambda (song, score): score, reverse=True):
                file.write("\t%d\t%f" % (song, score))
            file.write("\n")
        file.close()

def main(a_few_tags=False):
    if a_few_tags:
        A_FEW_TAGS = ['contemporary gospel']#,'yellow','a blues song form','country influences',"post rock","1970s"]
        Combiner(only_these_tags=set(A_FEW_TAGS), production_run=True).fill_in_zeros()
    else:
        # We run out of memory trying to do all tags at once, so just do 2000 at a time.
        N_TAGS_PER_ROUND = 2000
        PRODUCTION_RUN = True
        combiner = Combiner(production_run=PRODUCTION_RUN)
        all_tags = list(combiner.only_these_tags)
        tag_groups = util.partition(all_tags, N_TAGS_PER_ROUND)
        overwrite_final_tab_file = True
        for group in tag_groups:
            group_combiner = Combiner(production_run=PRODUCTION_RUN, only_these_tags=set(group), overwrite_final_tab_file=overwrite_final_tab_file)
            overwrite_final_tab_file = False # From now on, we'll just append to the current one.
            group_combiner.fill_in_zeros()

if __name__ == "__main__":
    main()