From 76ae4339aa13e849c6e577aa9738c345f14e8f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Paukert?= <paukeluk@fit.cvut.cz> Date: Thu, 9 Apr 2020 03:01:08 +0200 Subject: [PATCH] Improved speed of computing cosine similarity --- querying/src/calculation/Computor.cpp | 9 ++------- querying/src/database/Database.cpp | 18 +++++++++++++++--- querying/src/database/Database.h | 7 +++++++ weight_calculation/src/main.cpp | 12 ++++++++++-- 4 files changed, 34 insertions(+), 12 deletions(-) /* Reviewer notes (placed between the diffstat and the first file header; no hunk content is altered). Summary: Computor::compute() no longer calls space.getTermsAndWeightsByID() for every document ID and instead reads a per-document value from the map returned once by the new Database::getVectorSizes(); Database.cpp switches its getColumn() lookups from column names to positional indexes; weight_calculation/src/main.cpp additionally selects Term_id and writes each computed weight back with an UPDATE on TermDocumentOccurrence. The removed Czech comment reads roughly: 'terribly slows down the program run (on the order of seconds!!) - come up with another solution...'. getVectorSizes() accumulates weight * weight per Document_id and applies no square root, so the map holds squared vector lengths even though the Database.h comment calls them 'size of vector'; compute() takes the sqrt only after multiplying by vectorQuerySize. vectorSizes[ID] is std::map::operator[], which inserts a 0.0 entry for an ID that is absent; the 'denominator != 0' check that follows then skips the division. NOTE(review): the positional getColumn(0)/getColumn(1) calls depend on the column order of SELECT statements that lie outside these hunks - confirm. NOTE(review): the UPDATE and getVectorSizes() presume a weight column on TermDocumentOccurrence; no schema change is part of this patch - confirm it exists. NOTE(review): the SQLite::Statement for the UPDATE is constructed inside the while loop, once per fetched row, while the SELECT on the same table is still being stepped; hoisting it out of the loop and rebinding per row looks possible - confirm against the SQLiteCpp Statement documentation before changing. */ diff --git a/querying/src/calculation/Computor.cpp b/querying/src/calculation/Computor.cpp index 87fd290..714b0f3 100644 --- a/querying/src/calculation/Computor.cpp +++ b/querying/src/calculation/Computor.cpp @@ -15,7 +15,7 @@ Computor::Computor(Space space, Query query) vector<pair<int, double>> Computor::compute(Database & database) { vector<pair<int, double>> results; - map<string, double> computedDocument; + map<int, double> vectorSizes = database.getVectorSizes(); availableTerms = query.termsKeyset; @@ -26,8 +26,6 @@ vector<pair<int, double>> Computor::compute(Database & database) { while (!availableTerms.empty()) { int ID = nextID(); //get lowest ID double result = 0, denominator = 0; - // hrozne zpomaluje beh programu (v radu sekund!!) --> vymyslet jine reseni... - computedDocument = space.getTermsAndWeightsByID(database, ID); for (const auto &term: availableTerms) /*Go through all the remaining terms*/ { try { @@ -43,10 +41,7 @@ vector<pair<int, double>> Computor::compute(Database & database) { } } - for (const auto & entry : computedDocument) - denominator += entry.second * entry.second; - - denominator = sqrt(denominator * vectorQuerySize); + denominator = sqrt(vectorSizes[ID] * vectorQuerySize); // Input should not be zero vector but if it is, do not divide and "just" return wrong result.. 
if (denominator != 0) result = result / denominator; diff --git a/querying/src/database/Database.cpp b/querying/src/database/Database.cpp index 6aa7615..26887a4 100644 --- a/querying/src/database/Database.cpp +++ b/querying/src/database/Database.cpp @@ -11,7 +11,7 @@ vector<Document> Database::getDocumentsCollection() { vector<Document> result; while (query.executeStep()) - result.emplace_back(query.getColumn("id"), query.getColumn("filename")); + result.emplace_back(query.getColumn(0), query.getColumn(1)); return result; } @@ -21,7 +21,7 @@ Document Database::getDocumentByID(int id) { query.bind(":document_id", id); query.executeStep(); - return {id, query.getColumn("filename")}; + return {id, query.getColumn(0)}; } vector<string> Database::getTermsByDocumentID(int document_id) { @@ -32,7 +32,19 @@ vector<string> Database::getTermsByDocumentID(int document_id) { query.bind(":id", document_id); while(query.executeStep()) - terms.emplace_back(query.getColumn("value")); + terms.emplace_back(query.getColumn(0)); return terms; } + +map<int, double> Database::getVectorSizes() { + map<int, double> vectorSizes; + SQLite::Statement query(db, "SELECT Document_id, weight FROM TermDocumentOccurrence"); + + while(query.executeStep()) { + double tmp = query.getColumn(1); + vectorSizes[query.getColumn(0)] += tmp * tmp; + } + + return vectorSizes; +} diff --git a/querying/src/database/Database.h b/querying/src/database/Database.h index cfb1d94..8e759c6 100644 --- a/querying/src/database/Database.h +++ b/querying/src/database/Database.h @@ -2,6 +2,7 @@ #include <SQLiteCpp/SQLiteCpp.h> #include <vector> +#include <map> #include <string> #include "Document.h" @@ -35,4 +36,10 @@ public: * @return Vector with strings which are in specified document */ std::vector<std::string> getTermsByDocumentID(int document_id); + + /** + * @brief Computes size of vector for every document in database + * @return Map with document_id as key and size of vector as value + */ + std::map<int, double> 
getVectorSizes(); }; diff --git a/weight_calculation/src/main.cpp b/weight_calculation/src/main.cpp index 973b6c0..49a7d2b 100644 --- a/weight_calculation/src/main.cpp +++ b/weight_calculation/src/main.cpp @@ -17,7 +17,7 @@ bool calculateWeight(SQLite::Database & db, std::ofstream & ostream, const json try { - SQLite::Statement query(db, "SELECT TermDocumentOccurrence.Document_id, TermDocumentOccurrence.count FROM TermDocumentOccurrence " + SQLite::Statement query(db, "SELECT TermDocumentOccurrence.Document_id, TermDocumentOccurrence.count, TermDocumentOccurrence.Term_id FROM TermDocumentOccurrence " "JOIN Term ON TermDocumentOccurrence.Term_id = Term.id " "WHERE Term.value = :term " "ORDER BY TermDocumentOccurrence.Document_id ASC"); @@ -26,8 +26,16 @@ bool calculateWeight(SQLite::Database & db, std::ofstream & ostream, const json ostream << "\"" << term << "\":{"; while(query.executeStep()) { + int document_id = query.getColumn("Document_id").getInt(); weight = query.getColumn("count").getInt() / (occurrences*1.0); - ostream << "\"" << query.getColumn("Document_id") << "\":" << std::setprecision(20) << weight << ","; + ostream << "\"" << document_id << "\":" << std::setprecision(20) << weight << ","; + + SQLite::Statement update(db, "UPDATE TermDocumentOccurrence SET weight = :weight " + "WHERE Document_id = :document_id AND Term_id = :term_id"); + update.bind(":weight", weight); + update.bind(":document_id", document_id); + update.bind(":term_id", query.getColumn("Term_id").getInt()); + update.exec(); } ostream.seekp(-1, std::ios_base::end); -- GitLab