diff --git a/.gitignore b/.gitignore index 40815772bdf7fa9a7ef6666f9201ebeb26b7aa5f..0cb9c54c6c34b83de2c0c4bb291b020b2d8c225f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # custom data/ .idea/ +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cc77d71473e630d322e59402c431d771e86f286 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +# Minimum CMake version, project name and project version +cmake_minimum_required(VERSION 3.1) +project(VectorModel VERSION 1.0) + +# C++ 11 compiler is required +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Add subdirectory SQLiteCpp with all necessary files +add_subdirectory(./lib/SQLiteCpp) + +add_executable(main ./src/weight_calculation/main.cpp) + +target_link_libraries(main SQLiteCpp) diff --git a/TODO.md b/TODO.md index 3eb1e7af2b1e91ec8e6b35ff878a1bb5c8615678..9f1ef035168bdaab417a4730d36690a176471efb 100644 --- a/TODO.md +++ b/TODO.md @@ -2,8 +2,8 @@ Aplikace by měla obsahovat: * [X] Extrakce termů. * [X] Identifikace nevýznamových slov. * [X] Stemming/lematizace. -* [ ] Výpočet vah termů. -* [ ] Implementace indexovací struktury. +* [x] Výpočet vah termů. +* [x] Implementace indexovací struktury. * [ ] Vyhodnocení dotazu oproti indexovací struktuře. * [ ] Webový interface (zadání dotazu a vizualizace výsledku). diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000000000000000000000000000000000..11cca9fcf793a1bf2895b699fb01002875326e05 --- /dev/null +++ b/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Exit on first error +set -e + +# Create dir build +mkdir -p build +cd build + +cmake -DCMAKE_BUILD_TYPE=Debug .. + +# Build +cmake --build . 
diff --git a/requirements.md b/requirements.md new file mode 100644 index 0000000000000000000000000000000000000000..6f34f8f4009895dc1c2e0697786feb5ecea8ad9c --- /dev/null +++ b/requirements.md @@ -0,0 +1,11 @@ +# Požadavky +- kompilátor podporující minimálně C++ 11 +- vytvořit adresář ```lib``` v kořenovém adresáři projektu pro následující knihovny +- knihovna pro práci s SQLite: + - ke stažení [ZDE](https://github.com/SRombauts/SQLiteCpp/releases) + - rozbalit do adresáře ```lib``` a složku přejmenovat na ```SQLiteCpp``` +- knihovna pro práci s JSON soubory: + - ke stažení [ZDE](https://github.com/nlohmann/json/releases) + - stačí stáhnout hlavičkový soubor ```json.hpp``` a umístit ho do adresáře ```lib``` +- pro sestavení lze využít skript ```build.sh``` +- binárka se poté nachází v ```build/main```, je nutné ji spustit z adresáře ```build``` diff --git a/src/preprocessing/database/create-script.sql b/src/preprocessing/database/create-script.sql index a7addcacff316b1beb5ba877e324974960989767..7f2c8f1d0ae88824eba65d32183812148174da60 100644 --- a/src/preprocessing/database/create-script.sql +++ b/src/preprocessing/database/create-script.sql @@ -19,5 +19,3 @@ CREATE TABLE TermDocumentOccurrence FOREIGN KEY (Document_id) REFERENCES Document (id), FOREIGN KEY (Term_id) REFERENCES Term (id) ); - - diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py index 4962f38450631b98735132abc7d8e1de8816c4f8..5e59086f50d4b050b7ddaf6439d029b6a5b26b12 100644 --- a/src/preprocessing/preprocessor.py +++ b/src/preprocessing/preprocessor.py @@ -137,7 +137,10 @@ class Preprocessor: document_key = database.last_primary_key() for term in self.terms: database.execute('''INSERT OR IGNORE INTO Term(value) VALUES (?)''', [term]) - term_key = database.last_primary_key() + database.execute('''SELECT id FROM Term WHERE value = ?''', [term]) + term_key = database.fetchone()[0] + if term_key is None: + term_key = database.last_primary_key() 
database.execute('''INSERT INTO TermDocumentOccurrence(Term_id, Document_id, count) VALUES (?,?,?)''', [term_key, document_key, self.terms[term]]) database.commit()
diff --git a/src/weight_calculation/main.cpp b/src/weight_calculation/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7cb6673786c4a54365af1457657f3d6654c38152
--- /dev/null
+++ b/src/weight_calculation/main.cpp
@@ -0,0 +1,192 @@
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include <SQLiteCpp/SQLiteCpp.h>
+#include "../../lib/json.hpp"
+
+using json = nlohmann::json;
+
+/**
+ * Writes the inverted-list entry of one term as a single JSON object member.
+ *
+ * The output has the form "<term>":{"<Document_id>":<weight>,...}, documents in ascending
+ * Document_id order. A weight is the term's count in the document divided by the value
+ * stored for the term in maxOccurrences. No comma is written before or after the member;
+ * the caller separates members.
+ *
+ * @param db             database containing the Term and TermDocumentOccurrence tables
+ * @param ostream        stream receiving the JSON fragment
+ * @param maxOccurrences JSON object that maps a term to its divisor
+ * @param term           term whose weights are written
+ * @return true on success; false if the divisor is missing or zero, or if a database or
+ *         JSON error occurred (the reason is printed to std::cerr)
+ */
+bool calculateWeight(SQLite::Database & db, std::ofstream & ostream, const json & maxOccurrences, const std::string & term)
+{
+    try
+    {
+        // operator[] on a const json with an absent key is undefined behaviour, so look it up explicitly.
+        const auto divisor = maxOccurrences.find(term);
+        if (divisor == maxOccurrences.end())
+        {
+            std::cerr << "Term '" << term << "' not found in 'most_frequent_words.json'" << std::endl;
+            return false;
+        }
+
+        std::uint32_t occurrences = 0;
+        divisor->get_to(occurrences);
+        if (occurrences == 0)
+        {
+            // Dividing by zero would write inf/nan, which is not valid JSON.
+            std::cerr << "Term '" << term << "' has zero occurrences in 'most_frequent_words.json'" << std::endl;
+            return false;
+        }
+
+        SQLite::Statement query(db, "SELECT TermDocumentOccurrence.Document_id, TermDocumentOccurrence.count FROM TermDocumentOccurrence "
+                                    "JOIN Term ON TermDocumentOccurrence.Term_id = Term.id "
+                                    "WHERE Term.value = :term "
+                                    "ORDER BY TermDocumentOccurrence.Document_id ASC");
+        query.bind(":term", term);
+
+        // json::dump() quotes the term and escapes characters that would otherwise break the key.
+        ostream << json(term).dump() << ":{" << std::setprecision(20);
+
+        // A comma is written in front of every entry except the first, so an empty result
+        // produces "{}" and nothing has to be erased from the stream afterwards.
+        bool firstDocument = true;
+        while (query.executeStep())
+        {
+            const double weight = query.getColumn("count").getInt() / static_cast<double>(occurrences);
+            if (!firstDocument)
+                ostream << ",";
+            firstDocument = false;
+            ostream << "\"" << query.getColumn("Document_id") << "\":" << weight;
+        }
+
+        ostream << "}";
+    }
+    catch (const SQLite::Exception & e)
+    {
+        std::cerr << "SQLite exception: " << e.what() << std::endl;
+        return false;
+    }
+    catch (const std::exception & e)
+    {
+        std::cerr << "Exception while processing term '" << term << "': " << e.what() << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Writes the whole inverted list as one JSON object with a member per row of table Term.
+ *
+ * @param ostream        stream receiving the JSON document
+ * @param maxOccurrences JSON object that maps a term to its divisor
+ * @return true on success; false if any term failed or the database could not be read
+ */
+bool process(std::ofstream & ostream, const json & maxOccurrences)
+{
+    try
+    {
+        // The database is only queried, so open it read-only; OPEN_CREATE would silently
+        // create an empty database when the path is wrong.
+        SQLite::Database db("./../data/persistance/db", SQLite::OPEN_READONLY);
+        SQLite::Statement query(db, "SELECT value FROM Term");
+
+        ostream << "{";
+        bool firstTerm = true;
+        while (query.executeStep())
+        {
+            if (!firstTerm)
+                ostream << ",";
+            firstTerm = false;
+
+            if (!calculateWeight(db, ostream, maxOccurrences, query.getColumn("value")))
+                return false;
+        }
+        ostream << "}";
+    }
+    catch (const SQLite::Exception & e)
+    {
+        std::cerr << "SQLite exception: " << e.what() << std::endl;
+        return false;
+    }
+    catch (const std::exception & e)
+    {
+        std::cerr << "Exception: " << e.what() << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Reads 'most_frequent_words.json', computes the term weights and writes them to
+ * 'invertedList.json'. Both paths are relative to the working directory.
+ *
+ * @return EXIT_SUCCESS if the output file was written completely, EXIT_FAILURE otherwise
+ */
+int main()
+{
+    const char * const inputPath = "./../data/persistance/most_frequent_words.json";
+    const char * const outputPath = "./../data/persistance/invertedList.json";
+
+    std::ifstream input(inputPath);
+    if (input.fail())
+    {
+        std::cerr << "Cannot open/find file 'most_frequent_words.json'" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    json maxOccurrences;
+    try
+    {
+        input >> maxOccurrences;
+    }
+    catch (const std::exception & e)
+    {
+        // Parsing throws on malformed JSON; without this handler the program would terminate.
+        std::cerr << "Cannot parse file 'most_frequent_words.json': " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    input.close();
+    if (input.fail())
+    {
+        std::cerr << "Something went wrong with file 'most_frequent_words.json'" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // The output is opened only after the input was read, so a bad input does not truncate it.
+    std::ofstream output(outputPath);
+    if (output.fail())
+    {
+        std::cerr << "Cannot create file 'invertedList.json'" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (!process(output, maxOccurrences))
+    {
+        // Do not leave a partially written, invalid JSON file behind.
+        output.close();
+        std::remove(outputPath);
+        return EXIT_FAILURE;
+    }
+
+    output.close();
+    if (output.fail())
+    {
+        std::cerr << "Something went wrong during writing to output file 'invertedList.json'" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}