#ifndef ALGORITHMS_H #define ALGORITHMS_H #include "Structures.h" #include "util.h" #include #include #include #include template class ProcessorWrapper { public: ProcessorWrapper(vector& entries, bool isProbe) { policy.initialize(entries, isProbe); } void processMovieRatings(int movieId, vector& entries) { policy.processMovieRatings(movieId, entries); } void postProcess(vector& entries) { policy.postProcess(entries); } T policy; }; class WriteBinaryEntries { public: WriteBinaryEntries() { } void initialize(vector& entries, bool isProbe) { } void processMovieRatings(int movieId, vector& entries) { ofstream file((binaryEntriesDir + formatNumber(movieId/1000, 2) + "/" + trainingFilePrefix + formatNumber(movieId, 7) + trainingFileSuffix).c_str()); vector::const_iterator pos; union { char charRep[4]; int intRep; } charIntUnion; for (pos = entries.begin(); pos != entries.end(); ++pos) { charIntUnion.intRep = pos->userId; for (int i = 0; i < 4; i++) { file.put(charIntUnion.charRep[i]); } // The movieId is stored implicitly by the name of the file. file.put(pos->getRating()); /*charIntUnion.intRep = pos->ratingAndMovieId; for (int i = 0; i < 4; i++) { file.put(charIntUnion.charRep[i]); }*/ } entries.clear(); } void postProcess(vector& entries) { } }; class CalculateMovieRatings { public: CalculateMovieRatings() { } void initialize(vector& entries, bool isProbe) { if (isProbe) { file.open("probe/movieAverageRatings.txt"); } else { file.open("movieAverageRatings.txt"); } } void processMovieRatings(int movieId, vector& entries) { vector::const_iterator pos; int sum = 0; for (pos = entries.begin(); pos != entries.end(); ++pos) { sum += pos->getRating(); } double average = ((double) sum)/((double) entries.size()); file << average << "," << entries.size() << "," << calculateStdDev(entries, average) << endl; entries.clear(); } void postProcess(vector& entries) { } ofstream file; }; class CalculateGlobalAverage { public: CalculateGlobalAverage() : sum(0), count(0) { } void initialize(vector& entries, bool isProbe) { } void processMovieRatings(int movieId, vector& entries) { for (vector::const_iterator pos = entries.begin(); pos != entries.end(); ++pos) { count++; sum += pos->getRating(); } entries.clear(); } void postProcess(vector& entries) { cout << "Global average rating is " << ((double) sum)/((double) count) << " out of " << count << " ratings." << endl; } long sum; long count; }; class CalculateUserRatings { public: typedef std::map > UserRatingMap; CalculateUserRatings() { } void initialize(vector& entries, bool isProbe) { if (isProbe) { file.open("probe/userAverageRatings.txt"); } else { file.open("userAverageRatings.txt"); } } void processMovieRatings(int movieId, vector& entries) { for (vector::const_iterator pos = entries.begin(); pos != entries.end(); ++pos) { int userId = pos->getUserId(); UserRatingMap::iterator inst; inst = userAverages.find(userId); if (inst == userAverages.end()) { std::map ratingsToOccurrences; ratingsToOccurrences.insert(std::make_pair(pos->getRating(), 1)); userAverages.insert(std::make_pair(userId, ratingsToOccurrences)); } else { std::map ratingsToOccurrences = inst->second; std::map::iterator ratingsMapIt = ratingsToOccurrences.find(pos->getRating()); if (ratingsMapIt == ratingsToOccurrences.end()) { userAverages[userId][pos->getRating()] = 1; } else { userAverages[userId][pos->getRating()] = ratingsMapIt->second + 1; } } } entries.clear(); } void postProcess(vector& entries) { // Print out the average user ratings. for (UserRatingMap::const_iterator inst = userAverages.begin(); inst != userAverages.end(); ++inst) { int sum = 0; int numEntries = 0; for (std::map::const_iterator ratingsToOccurrences = inst->second.begin(); ratingsToOccurrences != inst->second.end(); ++ratingsToOccurrences) { numEntries += ratingsToOccurrences->second; sum += ratingsToOccurrences->first * ratingsToOccurrences->second; } double averageRating = ((double) sum) / ((double) numEntries); double sumSquareDist = 0.0; for (std::map::const_iterator ratingsToOccurrences = inst->second.begin(); ratingsToOccurrences != inst->second.end(); ++ratingsToOccurrences) { sumSquareDist += ratingsToOccurrences->second * (ratingsToOccurrences->first - averageRating) * (ratingsToOccurrences->first - averageRating); } double stdDev = sqrt(sumSquareDist/((double) numEntries)); file << inst->first << "," << averageRating << "," << numEntries << "," << stdDev << endl; } file.close(); } ofstream file; // Maps userIDs to (rating, number of occurrences) UserRatingMap userAverages; }; static bool lessUserId(const Entry& e1, const Entry& e2) { return (e1.getUserId() < e2.getUserId()); } class CalculateItemSimilarities { public: CalculateItemSimilarities() { } void initialize(vector& entries, bool isProbe) { this->isProbe = isProbe; movieRatings.resize(numMovies + 1); appendEntriesFromUserAverageRatings(isProbe, userAverages); //setEntriesFromMovieAverageRatings(isProbe, movieAverages); } void processMovieRatings(int movieId, vector& entries) { std::copy(entries.begin(), entries.end(), back_inserter(movieRatings[movieId])); entries.clear(); // Need to sort by userId for calculateCosine() to work. std::sort(movieRatings[movieId].begin(), movieRatings[movieId].end(), lessUserId); } double calculateCosine(const vector& vector1, const vector& vector2) { double dotProd = 0; double mag1Squared = 0.0; double mag2Squared = 0.0; int numEntries = 0; vector::const_iterator it1 = vector1.begin(), it2 = vector2.begin(); while (it1 != vector1.end() && it2 != vector2.end()) { const Entry e1 = *it1; const Entry e2 = *it2; if (e1.getUserId() == e2.getUserId()) { double e1Rating = ((double) e1.getRating()) - userAverages[e1.getUserId()].averageRating; double e2Rating = ((double) e2.getRating()) - userAverages[e2.getUserId()].averageRating; dotProd += e1Rating * e2Rating; mag1Squared += e1Rating * e1Rating; mag2Squared += e2Rating * e2Rating; ++it1; ++it2; ++numEntries; } else if (e1.getUserId() < e2.getUserId()) { ++it1; } else { ++it2; } } //cout << "Got " << numEntries << " entries to compute cosine." << endl; if (mag1Squared * mag2Squared > 0.000001) { return ((double) dotProd)/sqrt(mag1Squared*mag2Squared); } else { return 0.0; } } void postProcess(vector& entries) { cout << "Calculating cosines..." << endl; for (int i = 3427; i <= numMovies; i++) { if (i % 250 == 0) { cout << "cosining movie " << i << endl; } string fileToOpen = "movieCorrelations/" + formatNumber(i/1000, 2) + "/" + formatNumber(i, 7) + ".txt"; if (isProbe) { fileToOpen = "probe/" + fileToOpen; } ofstream file(fileToOpen.c_str()); for (int j = 1; j <= numMovies; j++) { double cosine = calculateCosine(movieRatings[i], movieRatings[j]); file << cosine << endl; } } cout << "Done." << endl; } std::vector > movieRatings; std::map userAverages; //std::vector movieAverages; bool isProbe; }; static bool lessMovieId(const Entry& e1, const Entry& e2) { return (e1.getMovieId() < e2.getMovieId()); } static bool greaterSimilarity(const std::pair& p1 ,const std::pair& p2) { return p1.second > p2.second; } class CalculateUserSimilarities { public: CalculateUserSimilarities() { } void initialize(vector& entries, bool isProbe) { this->isProbe = isProbe; setEntriesFromMovieAverageRatings(isProbe, movieAverages); appendEntriesFromUserAverageRatings(isProbe, userAverages); struct timeval tv; struct timezone tz; gettimeofday(&tv, &tz); srand(tv.tv_sec * 37 + tv.tv_usec); } void processMovieRatings(int movieId, vector& entries) { for (vector::const_iterator it = entries.begin(); it != entries.end(); ++it) { int userId = it->getUserId(); std::map >::iterator posToInsert; posToInsert = userRatings.find(userId); if (posToInsert == userRatings.end()) { vector ratings; ratings.reserve(50); ratings.push_back(*it); userRatings.insert(std::make_pair(userId, ratings)); } else { vector ratings = posToInsert->second; int size = ratings.size(); if (ratings.capacity() == size) { // We can't be too aggressive about reserving extra space // since memory is tight, so just allocate an extra few. ratings.reserve(size + 5); } ratings.push_back(*it); userRatings[userId] = ratings; } } entries.clear(); } double calculateCosine(const vector& vector1, const vector& vector2) { double dotProd = 0; double mag1Squared = 0.0; double mag2Squared = 0.0; int numEntries = 0; vector::const_iterator it1 = vector1.begin(), it2 = vector2.begin(); while (it1 != vector1.end() && it2 != vector2.end()) { const Entry e1 = *it1; const Entry e2 = *it2; if (e1.getMovieId() == e2.getMovieId()) { // FFV - correct this somehow? // Correct with user ratings for now. // Well, that didn't work well. Try movie ratings. //double e1Rating = ((double)e1.getRating()) - userAverages[e1.getUserId()].averageRating; //double e2Rating = ((double)e2.getRating()) - userAverages[e2.getUserId()].averageRating; double e1Rating = ((double)e1.getRating()) - movieAverages[e1.getMovieId()].averageRating; double e2Rating = ((double)e2.getRating()) - movieAverages[e2.getMovieId()].averageRating; dotProd += e1Rating * e2Rating; mag1Squared += e1Rating * e1Rating; mag2Squared += e2Rating * e2Rating; ++it1; ++it2; ++numEntries; } else if (e1.getMovieId() < e2.getMovieId()) { ++it1; } else { ++it2; } } //cout << "Got " << numEntries << " entries to compute cosine." << endl; // Need at least 5 entries to care. if (mag1Squared * mag2Squared > 0.000001 && numEntries >= 5) { return ((double) dotProd)/sqrt(mag1Squared*mag2Squared); } else { // no entries with which to compare them. // this is a kludge. return -1000.0; } } void postProcess(vector& entries) { cout << "Calculating cosines..." << endl; // Need to sort by userId for calculateCosine() to work. map >::iterator userIt; for (userIt = userRatings.begin(); userIt != userRatings.end(); ++userIt) { std::sort(userIt->second.begin(), userIt->second.end(), lessMovieId); } int numToSkip = 0; int i = 0; // For resuming purposes - skip some. for (userIt = userRatings.begin(); userIt != userRatings.end() && (i < numToSkip); ++userIt, ++i) { } vector > userSimilarities; for (; userIt != userRatings.end(); ++userIt, ++i) { userSimilarities.clear(); int userId = userIt->first; if (i % 250 == 0) { cout << "cosining user - have done " << i << " (on user " << userId << ")" << endl; } string fileToOpen = "userCorrelations/" + formatNumber(userId%100, 2) + "/" + formatNumber(userId, 7) + ".txt"; if (isProbe) { fileToOpen = "probe/" + fileToOpen; } ofstream file(fileToOpen.c_str()); map >::iterator secondUserIt; // We can't try all 480189 users - that takes too long. Let's // try just 20000 of them (this makes this run in about 24 hours) vector userIndices; int m = 480189; int n = 20000; userSimilarities.reserve(n); userIndices.reserve(n); for (int j = 0; j < n; j++) { /* select m of remaining n-j */ if ((rand() % (n-j)) < m) { userIndices.push_back(j); m--; } } secondUserIt = userRatings.begin(); int lastUserIndex = 0; int globalIndex = 0; for (globalIndex = 0; globalIndex < n; ++globalIndex) { // Advance secondUserIt to the appropriate place. int curUserIndex = userIndices[globalIndex]; int numToAdvance = curUserIndex - lastUserIndex; for (int j = 0; j < numToAdvance; ++secondUserIt, ++j) { } double cosine = calculateCosine(userIt->second, secondUserIt->second); if (cosine != -1000.0) { // TODO - if this works, do this binary style userSimilarities.push_back(std::make_pair(secondUserIt->first, cosine)); } lastUserIndex = curUserIndex; } // Print out the top, say, 500 pairs std::partial_sort(userSimilarities.begin(), userSimilarities.begin() + 500, userSimilarities.end(), greaterSimilarity); for (vector >::const_iterator simIt = userSimilarities.begin(); simIt != userSimilarities.end() && simIt != userSimilarities.begin() + 500; ++simIt) { file << simIt->first << "," << simIt->second << endl; } } cout << "Done." << endl; } std::vector movieAverages; std::map userAverages; std::map > userRatings; bool isProbe; }; #endif