Skip to content
Snippets Groups Projects
Commit 21903e90 authored by Lukáš Paukert
Browse files

WIP: Implemented querying with files, files similarity calculation fixed

parent f0d0d83a
No related branches found
No related tags found
No related merge requests found
......@@ -50,6 +50,7 @@ set(SOURCES
src/calculation/WeightedDocument.cpp src/calculation/WeightedDocument.h
src/util/QueryJSONParser.cpp src/util/QueryJSONParser.h
src/exceptions/Exceptions.h
src/database/Terms.cpp src/database/Terms.h
src/database/DocumentCollection.cpp src/database/DocumentCollection.h src/util/ArgumentParser.cpp src/util/ArgumentParser.h src/database/Document.cpp src/database/Document.h)
 
# User interface files
......
......@@ -7,8 +7,9 @@ body {
}
 
button {
font-size: 20px;
margin: 20px;
font-size: 25px;
margin: 25px;
padding: 5px;
}
 
.navigation {
......
#!/bin/bash
# Builds the project and starts the Querying web application.
# NOTE(review): build.sh is not visible here; if it already configures and
# builds the project, the explicit CMake steps below may be redundant — confirm.

# Exit on first error
set -e
# Build
./build.sh

# Create dir build
# (-p: no error if the directory already exists)
mkdir -p build
cd build

# Configure a Debug build from the parent directory's CMake project
cmake -DCMAKE_BUILD_TYPE=Debug ..
# Build
cmake --build .
# Launch
# Static resources are taken from ../resources; the server listens on 0.0.0.0:3999
./Querying --docroot ./../resources --http-address 0.0.0.0 --http-port 3999
#include <iostream>
#include <climits>
#include <utility>
#include <algorithm>
#include <cmath>
#include "Computor.h"
#include "../exceptions/Exceptions.h"
 
......@@ -10,14 +13,15 @@ Computor::Computor(Space space, Query query)
: space(std::move(space)),
query(std::move(query)) {}
 
map<int, double> Computor::compute() {
map<int, double> results;
vector<pair<int, double>> Computor::compute(Terms & collection, int document_id) {
vector<pair<int, double>> results;
map<string, double> currentDocument = space.getTermsAndWeightsByID(collection, document_id);
 
availableTerms = query.termsKeyset;
 
while (!availableTerms.empty()) {
int ID = nextID(); //get lowest ID
double result = 0;
double result = 0, denominator = 0, tmp = 0;
 
for (const auto &term: availableTerms) /*Go through all the remaining terms*/ {
try {
......@@ -32,10 +36,25 @@ map<int, double> Computor::compute() {
availableTerms.erase(term); //exhaust term
}
}
for (const auto & entry : query.terms)
denominator += entry.second * entry.second;
for (const auto & entry : currentDocument)
tmp += entry.second * entry.second;
denominator = sqrt(denominator * tmp);
// Input should not be zero vector but if it is, do not divide and "just" return wrong result..
if (denominator != 0)
result = result / denominator;
if (result > query.threshold) //filter out irrelevant documents
results[ID] = result;
results.emplace_back(make_pair(ID, result));
}
 
sort(results.begin(), results.end(), [] (const pair<int, double> & a, const pair<int, double> & b)
{ return a.second > b.second; });
return results;
}
 
......@@ -47,5 +66,3 @@ int Computor::nextID() {
 
return lowestID;
}
......@@ -5,9 +5,11 @@
#ifndef QUERYING_COMPUTOR_H
#define QUERYING_COMPUTOR_H
 
#include <map>
#include <vector>
#include "Space.h"
#include "Query.h"
#include "./../database/Terms.h"
 
/**
* @brief A class that encompasses the calculation of a document's relevancy to the query
......@@ -38,9 +40,9 @@ public:
* @brief Computes relevancies of documents to the query
*
* Filters relevancies that are below @ref Query::threshold
* @return Map of document IDs and their relevancies to the query
* @return Vector of pairs with document IDs and their similarity to the query, sorted by similarity desc
*/
std::map<int, double> compute();
std::vector<std::pair<int, double>> compute(Terms & collection, int document_id);
};
 
 
......
......@@ -18,4 +18,14 @@ InvertedIndex &Space::getInvertedIndexByKey(const string &key) {
return terms.at(key);
}
 
/**
 * @brief Collects the weight of every term that occurs in one document.
 *
 * The term list is fetched from @p collection; each term is then resolved
 * through getInvertedIndexByKey(), which looks the key up with at(), so a term
 * that is not part of this space propagates that lookup's exception.
 *
 * @param collection  Source of the document's terms
 * @param document_id ID of the document to process
 * @return Map with terms as keys and their weights in the document as values
 */
const map<string, double> Space::getTermsAndWeightsByID(Terms & collection, int document_id) {
    map<string, double> weightsByTerm;

    for (const string & termName : collection.getTermsByDocumentID(document_id)) {
        // Query a copy of the index (as before) so the stored index is never touched here.
        InvertedIndex indexCopy = getInvertedIndexByKey(termName);
        weightsByTerm[termName] = indexCopy.getDocumentWeightByID(document_id);
    }

    return weightsByTerm;
}
......@@ -8,6 +8,7 @@
 
#include "InvertedIndex.h"
#include "Query.h"
#include "./../database/Terms.h"
 
/**
* @brief A class representing the vector space of the collection
......@@ -34,6 +35,12 @@ public:
*/
InvertedIndex &getInvertedIndexByKey(const std::string &key);
 
/**
* @brief Finds all terms in DB which occurs in specific document
* @param collection, document_id Instance of Terms class with DB connection and document_id to process
* @return Map with terms as keys and weights as their values
*/
const std::map<std::string, double> getTermsAndWeightsByID(Terms & collection, int document_id);
 
/**
* @brief Gets an element from @ref Space::terms with key @ref key
......
#include "Terms.h"
/**
 * @brief Creates the term accessor and its database connection.
 * @param path Value handed to the SQLite::Database constructor
 *             (presumably the path of the SQLite file — confirm against callers)
 */
Terms::Terms(const std::string &path)
: db(path) {}
std::vector<std::string> Terms::getTermsByDocumentID(int document_id) {
std::vector<std::string> terms;
SQLite::Statement query(db, "SELECT Term.value FROM Term "
"JOIN TermDocumentOccurrence ON Term.id = TermDocumentOccurrence.Term_id "
"WHERE TermDocumentOccurrence.Document_id = :id");
query.bind(":id", document_id);
while(query.executeStep())
terms.emplace_back(query.getColumn("value"));
return terms;
}
#pragma once
#include <SQLiteCpp/SQLiteCpp.h>
#include <vector>
#include <string>
/**
* A class that handles fetching Terms from the TermDocumentOccurrence table
*/
class Terms {
private:
SQLite::Database db; /**< Database connection */
public:
/**
 * @brief Creates the accessor; @p path is forwarded to the database connection
 * @param path Value used to construct the SQLite::Database member
 */
explicit Terms(const std::string &path);
/**
 * @brief Finds all terms that occur in the specified document
 * @param document_id ID of the document to process
 * @return Vector with the term strings found for the specified document
 */
std::vector<std::string> getTermsByDocumentID(int document_id);
};
#include <Wt/WContainerWidget.h>
#include <Wt/WText.h>
#include <Wt/WMenu.h>
#include <Wt/WStackedWidget.h>
#include <Wt/WBreak.h>
#include <Wt/WPushButton.h>
 
#include <fstream>
......@@ -14,7 +14,6 @@
#include "./../util/QueryJSONParser.h"
#include "./../util/InvertedIndexJSONParser.h"
#include "./../database/DocumentCollection.h"
#include "./../database/Document.h"
 
#include "MainPage.h"
 
......@@ -41,7 +40,7 @@ MainPage::MainPage(const Wt::WEnvironment& env)
container->addWidget(std::move(buttonPtr));
 
button->clicked().connect([=] {
displayDetail(container, availableDocuments.at(menu->currentIndex()).name);
displayDetail(space, availableDocuments, container, availableDocuments.at(menu->currentIndex()).id);
});
}
 
......@@ -58,44 +57,36 @@ std::string MainPage::getName(const std::string & path)
std::string MainPage::getDocument(const std::string & path)
{
std::string content;
std::ifstream file(path);
content.assign((std::istreambuf_iterator<char>(file)),
(std::istreambuf_iterator<char>() ));
file.close();
 
encode(content);
size_t start_pos = 0;
while((start_pos = content.find('\n', start_pos)) != std::string::npos) {
content.replace(start_pos, 1, "<br/>");
start_pos += 5;
}
return content;
}
 
/**
 * @brief Replaces the five markup-significant characters in place.
 *
 * '&', '"', '\'', '<' and '>' become "&amp;", "&quot;", "&apos;", "&lt;" and
 * "&gt;"; every other character is copied unchanged.
 *
 * @param data Text to encode; overwritten with the encoded result
 */
void MainPage::encode(std::string & data) {
    std::string escaped;
    escaped.reserve(data.size());

    for (const char ch : data) {
        if (ch == '&')
            escaped += "&amp;";
        else if (ch == '\"')
            escaped += "&quot;";
        else if (ch == '\'')
            escaped += "&apos;";
        else if (ch == '<')
            escaped += "&lt;";
        else if (ch == '>')
            escaped += "&gt;";
        else
            escaped += ch;
    }

    data.swap(escaped);
}
void MainPage::displayDetail(Wt::WContainerWidget * container, const std::string & path)
/**
 * @brief Replaces the container's content with the detail view of one document.
 *
 * Lists up to ten documents returned by Computor::compute() for a query built
 * from the selected document's own terms and weights, then shows the
 * document's name and text.
 *
 * @param space              Vector space used for the similarity computation (taken by value)
 * @param availableDocuments Documents that can be displayed
 * @param container          Widget whose content is replaced
 * @param document_id        ID of the document to show
 */
void MainPage::displayDetail(Space space, const std::vector<Document> & availableDocuments, Wt::WContainerWidget * container, int document_id)
{
// deletes everything from current container
container->clear();
// Show similar books
// NOTE(review): `path` is not a parameter of this signature; the next two lines
// look like leftovers of the previous path-based version — confirm.
container->addNew<Wt::WText>("<h1>" + getName(path) + "</h1>");
container->addNew<Wt::WText>(getDocument(path));
// TODO: do this better so that the path does not have to be hard-coded here
Terms collection("./../../data/persistence/docs_and_terms.db");
// NOTE(review): assumes document IDs are 1-based and match the vector order — confirm.
Document document = availableDocuments.at(document_id - 1);
// The threshold is currently set to -1 --> the result will also contain completely different documents
Query query(space.getTermsAndWeightsByID(collection, document.id), -1);
auto result = Computor(space, query).compute(collection, document.id);
// TODO: make the displayed similar documents clickable
// Show at most the first ten results
for (size_t i = 0; i < 10 && i < result.size(); i++) {
container->addNew<Wt::WText>("Document ID: " + std::to_string(result.at(i).first) + "; relevance: " + std::to_string(result.at(i).second));
container->addNew<Wt::WBreak>();
}
container->addNew<Wt::WText>("<h1>" + getName(document.name) + "</h1>");
// The document body is added as plain-format text
auto text = Wt::cpp14::make_unique<Wt::WText>();
text->setTextFormat(Wt::TextFormat::Plain);
text->setText(getDocument(document.name));
container->addWidget(std::move(text));
}
......@@ -3,6 +3,8 @@
#include <Wt/WApplication.h>
#include <string>
 
#include "./../database/Document.h"
class MainPage : public Wt::WApplication
{
public:
......@@ -11,6 +13,5 @@ public:
private:
std::string getName(const std::string & path);
std::string getDocument(const std::string & path);
void encode(std::string & content);
void displayDetail(Wt::WContainerWidget * container, const std::string & path);
void displayDetail(Space space, const std::vector<Document> & availableDocuments, Wt::WContainerWidget * container, int document_id);
};
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment