From 76ae4339aa13e849c6e577aa9738c345f14e8f10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Paukert?= <paukeluk@fit.cvut.cz>
Date: Thu, 9 Apr 2020 03:01:08 +0200
Subject: [PATCH] Improved speed of computing cosine similarity

---
 querying/src/calculation/Computor.cpp |  9 ++-------
 querying/src/database/Database.cpp    | 18 +++++++++++++++---
 querying/src/database/Database.h      |  7 +++++++
 weight_calculation/src/main.cpp       | 12 ++++++++++--
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/querying/src/calculation/Computor.cpp b/querying/src/calculation/Computor.cpp
index 87fd290..714b0f3 100644
--- a/querying/src/calculation/Computor.cpp
+++ b/querying/src/calculation/Computor.cpp
@@ -15,7 +15,7 @@ Computor::Computor(Space space, Query query)
 
 vector<pair<int, double>> Computor::compute(Database & database) {
     vector<pair<int, double>> results;
-    map<string, double> computedDocument;
+    map<int, double> vectorSizes = database.getVectorSizes();
 
     availableTerms = query.termsKeyset;
 
@@ -26,8 +26,6 @@ vector<pair<int, double>> Computor::compute(Database & database) {
     while (!availableTerms.empty()) {
         int ID = nextID(); //get lowest ID
         double result = 0, denominator = 0;
-        // hrozne zpomaluje beh programu (v radu sekund!!) --> vymyslet jine reseni...
-        computedDocument = space.getTermsAndWeightsByID(database, ID);
 
         for (const auto &term: availableTerms) /*Go through all the remaining terms*/ {
             try {
@@ -43,10 +41,7 @@ vector<pair<int, double>> Computor::compute(Database & database) {
             }
         }
 
-        for (const auto & entry : computedDocument)
-            denominator += entry.second * entry.second;
-        
-        denominator = sqrt(denominator * vectorQuerySize);
+        denominator = sqrt(vectorSizes[ID] * vectorQuerySize);
         // Input should not be zero vector but if it is, do not divide and "just" return wrong result..
         if (denominator != 0)
             result = result / denominator;
diff --git a/querying/src/database/Database.cpp b/querying/src/database/Database.cpp
index 6aa7615..26887a4 100644
--- a/querying/src/database/Database.cpp
+++ b/querying/src/database/Database.cpp
@@ -11,7 +11,7 @@ vector<Document> Database::getDocumentsCollection() {
     vector<Document> result;
 
     while (query.executeStep())
-        result.emplace_back(query.getColumn("id"), query.getColumn("filename"));
+        result.emplace_back(query.getColumn(0), query.getColumn(1));
 
     return result;
 }
@@ -21,7 +21,7 @@ Document Database::getDocumentByID(int id) {
     query.bind(":document_id", id);
     query.executeStep();
 
-    return {id, query.getColumn("filename")};
+    return {id, query.getColumn(0)};
 }
 
 vector<string> Database::getTermsByDocumentID(int document_id) {
@@ -32,7 +32,19 @@ vector<string> Database::getTermsByDocumentID(int document_id) {
     query.bind(":id", document_id);
 
     while(query.executeStep())
-        terms.emplace_back(query.getColumn("value"));
+        terms.emplace_back(query.getColumn(0));
 
     return terms;
 }
+
+// Sums weight * weight per Document_id over all TermDocumentOccurrence rows (no square root applied).
+map<int, double> Database::getVectorSizes() {
+    SQLite::Statement rows(db, "SELECT Document_id, weight FROM TermDocumentOccurrence");
+    map<int, double> squaredSums;
+    while (rows.executeStep()) {
+        const double weight = rows.getColumn(1).getDouble();
+        const int documentId = rows.getColumn(0).getInt();
+        squaredSums[documentId] += weight * weight; // operator[] starts a new id at 0.0
+    }
+    return squaredSums;
+}
diff --git a/querying/src/database/Database.h b/querying/src/database/Database.h
index cfb1d94..8e759c6 100644
--- a/querying/src/database/Database.h
+++ b/querying/src/database/Database.h
@@ -2,6 +2,7 @@
 
 #include <SQLiteCpp/SQLiteCpp.h>
 #include <vector>
+#include <map>
 #include <string>
 
 #include "Document.h"
@@ -35,4 +36,10 @@ public:
      * @return Vector with strings which are in specified document
      */
     std::vector<std::string> getTermsByDocumentID(int document_id);
+
+    /**
+     * @brief Computes, for every document, the sum of its squared term weights (squared vector length; no square root is taken)
+     * @return Map with Document_id as key and that sum as value; documents without TermDocumentOccurrence rows are absent
+     */
+    std::map<int, double> getVectorSizes();
 };
diff --git a/weight_calculation/src/main.cpp b/weight_calculation/src/main.cpp
index 973b6c0..49a7d2b 100644
--- a/weight_calculation/src/main.cpp
+++ b/weight_calculation/src/main.cpp
@@ -17,7 +17,7 @@ bool calculateWeight(SQLite::Database & db, std::ofstream & ostream, const json
 
     try
     {
-        SQLite::Statement query(db, "SELECT TermDocumentOccurrence.Document_id, TermDocumentOccurrence.count FROM TermDocumentOccurrence "
+        SQLite::Statement query(db, "SELECT TermDocumentOccurrence.Document_id, TermDocumentOccurrence.count, TermDocumentOccurrence.Term_id FROM TermDocumentOccurrence "
                                     "JOIN Term ON TermDocumentOccurrence.Term_id = Term.id "
                                     "WHERE Term.value = :term "
                                     "ORDER BY TermDocumentOccurrence.Document_id ASC");
@@ -26,8 +26,16 @@ bool calculateWeight(SQLite::Database & db, std::ofstream & ostream, const json
         ostream << "\"" << term << "\":{";
         while(query.executeStep())
         {
+            int document_id = query.getColumn("Document_id").getInt();
             weight = query.getColumn("count").getInt() / (occurrences*1.0);
-            ostream << "\"" << query.getColumn("Document_id") << "\":" << std::setprecision(20) << weight << ",";
+            ostream << "\"" << document_id << "\":" << std::setprecision(20) << weight << ",";
+
+            SQLite::Statement update(db, "UPDATE TermDocumentOccurrence SET weight = :weight "
+                                         "WHERE Document_id = :document_id AND Term_id = :term_id");
+            update.bind(":weight", weight);
+            update.bind(":document_id", document_id);
+            update.bind(":term_id", query.getColumn("Term_id").getInt());
+            update.exec();
         }
 
         ostream.seekp(-1, std::ios_base::end);
-- 
GitLab