Commit a2e879ae authored by Haj Rezvan's avatar Haj Rezvan

First commit.

parents
Pipeline #6125 canceled with stages
import json
def union_two(a, b):
    """Stable two-way merge of the sorted lists *a* and *b*.

    Returns a new sorted list containing every element of both inputs
    (duplicates preserved); neither input is modified.
    """
    merged = []
    i = j = 0
    len_a, len_b = len(a), len(b)
    while i < len_a and j < len_b:
        if a[i] <= b[j]:
            merged.append(a[i])
            i += 1
        else:
            merged.append(b[j])
            j += 1
    # One side is exhausted; the remainder of the other is already sorted.
    merged.extend(a[i:])
    merged.extend(b[j:])
    return merged
def normalize_list(lst):
    """Coerce *lst* into a plain list of posting lists.

    dict  -> list of its values
    list  -> returned unchanged
    other -> None (explicit, was an implicit fall-through before)

    Uses isinstance() instead of ``type(x) == T`` so subclasses work too.
    """
    if isinstance(lst, dict):
        return list(lst.values())
    if isinstance(lst, list):
        return lst
    return None
def union(inp):
    """Sorted multi-way merge ("union") of all posting lists in *inp*.

    *inp* is either a dict whose values are sorted lists, or a list of
    sorted lists. Returns a single sorted list (duplicates preserved),
    [] for empty input, or None for unsupported input types.

    BUG FIX: the original recursion sliced ``vl[0:n-2]`` (dropping one
    list per round) and appended partial results into a nested temp list,
    producing wrong, nested output for three or more lists, and returned
    the still-nested input unchanged for a single list.
    """
    import heapq  # local import: stdlib k-way merge of sorted iterables

    if isinstance(inp, dict):
        lists = list(inp.values())
    elif isinstance(inp, list):
        lists = list(inp)
    else:
        return None
    if not lists:
        return []
    return list(heapq.merge(*lists))
def intersect(lst1, lst2):
    """Return the elements common to both lists (order not guaranteed)."""
    common = set(lst1)
    common &= set(lst2)
    return list(common)
def operations(inp):
    """Detect the next boolean operator in the query token list *inp*.

    Returns ["&"] if "AND" is present, otherwise ["!"] if "NOT" is
    present, otherwise [].  AND deliberately takes priority over NOT,
    matching the original elif chain.
    """
    if "AND" in inp:
        return ["&"]
    if "NOT" in inp:
        return ["!"]
    return []
# Load the stored document record for every referenced docID.
def get_info(inp):
    """Fetch document records for the docIDs referenced by *inp*.

    *inp* is either {word: [docID, ...]} or a flat list of docIDs.
    Each docID is read from ``./docs/<docID>.json``; the record's values
    (doc_id, title, content, url) are returned as one list per document.

    BUG FIX: the original never closed the per-document file handles;
    ``with`` guarantees closure even if json.load raises.
    """
    if isinstance(inp, dict):
        # Flatten every word's posting list into one docID sequence.
        doc_ids = [doc_id for ids in inp.values() for doc_id in ids]
    elif isinstance(inp, list):
        doc_ids = inp
    else:
        doc_ids = []
    out = []
    for doc_id in doc_ids:
        with open(f"./docs/{doc_id}.json", "r", encoding="utf-8") as file:
            cnt = json.load(file)
        out.append(list(dict(cnt).values()))
    return out
def write_logs(string):
    """Append *string* to the query log at ./logs/log.txt.

    BUG FIX: the original left the handle to be closed manually; the
    ``with`` statement closes it even if the write raises.
    """
    with open("./logs/log.txt", "a", encoding="utf-8") as file:
        file.write(string)
def enter(it):
    # Evaluate the raw query string *it* against the inverted index and
    # return the matching documents' records (via get_info).
    write_logs(it)
    # Whitespace tokenization of the query; operator words (AND/NOT) stay in.
    spl = list(it.split(" "))
    # NOTE(review): this handle is never closed — consider a with-statement.
    file = open("./index/ii.json", "r", encoding="utf-8")
    index = json.load(file)
    dictionary = dict(index)
    # rs: query words that actually occur in the index.
    rs = []
    for word in spl:
        if word in dictionary.keys():
            rs.append(word)
    # ld: word -> posting list, restricted to the recognized words.
    ld = dict()
    for i in range(len(rs)):
        ld[rs[i]] = index.get(rs[i])
    opt = list()
    if len(rs) > 1:
        # Repeatedly consume one operator per pass until none remain.
        flag = operations(spl)
        while len(flag) > 0:
            if "&" in flag:
                # AND: intersect the posting lists of the neighboring words.
                _and = spl.index("AND")
                opt.append(intersect(ld[spl[_and + 1]], ld[spl[_and - 1]]))
                spl.pop(_and)
                flag = operations(spl)
            elif "!" in flag:
                # NOT: the operator is only removed here — negation is not
                # actually applied to the result set (NOTE(review): looks
                # half-implemented; confirm intended semantics).
                _not = spl.index("NOT")
                spl.pop(_not)
                flag = operations(spl)
        pass
    # NOTE(review): `opt` (the AND results) is computed but never used;
    # all recognized words' postings are returned regardless.
    out_data = get_info(ld)
    return out_data
# Flask project
from flask import Flask, render_template, request
import QP
app = Flask(__name__)
def query(inp):
    # Thin wrapper: delegate the raw query string to the QP (query
    # processor) module and return its result rows.
    return QP.enter(inp)
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the search page.

    GET  -> empty page with the query form.
    POST -> run the submitted query and render the results table.
    """
    # Guard clause: plain GET renders the bare form.
    if request.method != 'POST':
        return render_template("index.html")
    results = query(request.form['query'])
    return render_template("index.html", data=results)
import os

# Module-level side effect: the script starts inside the token directory
# so function() below can listdir() it directly.
os.chdir("./document tokens")
def function():
    """Scrub artifacts (quote runs, soft hyphens) from every tokenized
    document file under "./document tokens".

    Assumes the cwd is "./document tokens" on entry (set at import time)
    and leaves it at the project root on exit.

    BUG FIX: the original called os.chdir("..") inside the loop, climbing
    one directory level on every iteration, so every file after the first
    was opened relative to the wrong directory. It also never closed its
    read/write handles.
    """
    names = os.listdir()  # cwd is still the token directory here
    os.chdir("..")        # hop up to the project root exactly once
    for file in names:
        file_name = os.path.join("./document tokens", file)
        with open(file_name, "r", encoding="utf-8") as rfl:
            string = rfl.read()
        string = string.replace("\"\"\"", "\"\"")
        # '\xad', '\u00ad' and '\N{SOFT HYPHEN}' are all the same character;
        # a single replace covers the three original calls.
        string = string.replace('\xad', '')
        with open(file_name, "w", encoding="utf-8") as wfl:
            wfl.write(string)
        print(f"File {file} debugged.")


function()
import json
import os
def index():
    """Build the inverted index {token: sorted [docID, ...]} from every
    per-document token file and dump it to ./index/ii.json.

    Side effect (kept from the original): changes the process cwd to
    "./document tokens".

    BUG FIX: the output handle was never closed; postings were re-sorted
    on every single insertion (now sorted once at the end — same result).
    """
    invert_index = dict()
    # Open the output BEFORE chdir: its path is relative to the project root.
    with open("./index/ii.json", "w", encoding="utf-8") as index_file:
        os.chdir("./document tokens")
        for tk in os.listdir():
            with open(tk, "r", encoding="utf-8") as token_file:
                tkn = dict(json.load(token_file))
            # File names are "<docID>.json".
            doc_id = int(tk.replace(".json", ""))
            for key in tkn:
                invert_index.setdefault(str(key), []).append(doc_id)
        # listdir order is arbitrary, so sort each posting list once.
        for postings in invert_index.values():
            postings.sort()
        json.dump(invert_index, index_file)
    print("Invert index made!")
import os
import index_maker
import split_document
import tokenizer
from tests import SE as t
def __file_finder():
    # List all split document files in ./docs.
    # NOTE(review): leaves the process cwd changed to ./docs; the caller
    # in __main__ compensates with os.chdir("..") afterwards.
    os.chdir("./docs")
    return os.listdir()
if __name__ == '__main__':
    # Pipeline entry point: split the raw corpus into per-document JSON
    # files, tokenize them, then build the inverted index. The t.* calls
    # are test hooks run after each stage.
    print("بسم الله الرحمن الرحیم")
    split_document.file_open()
    t.split()
    files = __file_finder()  # cwd is ./docs after this call
    os.chdir("..")
    tokenizer.get_file(files)
    t.token(files)
    os.chdir("..")
    index_maker.index()
import json
from tqdm import tqdm
def file_open_test():
    """Split the small test corpus into per-document files.

    BUG FIX: the file handle was never closed; ``with`` closes it once
    __processor has consumed it.
    """
    with open("data/test_data.json") as file:
        __processor(file)
def file_open():
    """Split the full 12k news corpus into per-document files.

    BUG FIX: the file handle was never closed; ``with`` closes it once
    __processor has consumed it.
    """
    with open("data/IR_data_news_12k.json") as file:
        __processor(file)
def __writer(doc_id, title, content, url):
    """Persist one document record as docs/<doc_id>.json.

    BUG FIX: the file handle was never closed; ``with`` guarantees the
    JSON is flushed and the handle released.
    """
    record = {
        "doc_id": doc_id,
        "title": title,
        "content": content,
        "url": url,
    }
    with open(f"docs/{doc_id}.json", "w") as file:
        json.dump(record, file)
def __object_clear(lst):
    """Empty *lst* in place (mutates the caller's list)."""
    del lst[:]
def __processor(file):
    """Split the raw corpus JSON in *file* into per-document files.

    The corpus is a JSON object keyed by stringified integers 0..13999
    (with gaps); each entry carries "title", "content" and "url" fields.
    Missing ids are skipped, matching the original best-effort behavior.

    Improvements: the original rebuilt each field character-by-character
    through a shared scratch list (''.join of the characters yields the
    same string directly), and swallowed every exception with a bare
    ``except Exception: pass`` — now only the expected KeyError for
    missing entries/fields is caught.
    """
    data = json.load(file)
    for i in tqdm(range(14000), desc="Splatted: "):
        try:
            entry = data[f"{i}"]
            # join() tolerates either a str or a char-list field value.
            title = ''.join(entry["title"])
            content = ''.join(entry["content"])
            url = ''.join(entry["url"])
        except KeyError:
            # Gap in the corpus ids — skip silently, as before.
            continue
        __writer(i, title, content, url)
/* Styles for the search-result table cells (see index.html). */

/* Body/content cell: clamp to a small centered box, hide overflow
   behind an ellipsis, and force long words to wrap. */
.content {
    text-align: center;
    max-width: 60px;
    height: 18px;
    overflow: hidden;
    text-overflow: ellipsis;
    font-family: 'B Koodak', monospace;
    color: black;
    word-break: break-all;
    word-wrap: break-word;
}

/* Title/URL cells: narrower box, same truncation-with-ellipsis look. */
.ellipsis {
    text-align: center;
    max-width: 40px;
    text-overflow: ellipsis;
    overflow: hidden;
    font-family: 'B Koodak', monospace;
    color: black;
}
\ No newline at end of file
<!DOCTYPE html>
<!-- Search front page (Farsi search engine): a query form plus a results
     table rendered by Flask/Jinja2. POSTs back to the index() route. -->
<html lang="en" xmlns="http://www.w3.org/1999/html">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
          integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
    <link href="../static/file.css" rel="stylesheet">
    <title>موتور جست و جوی فارس</title>
</head>
<body>
<h1>
    موتور جست و جوی فارس
</h1>
<!-- Query form: posts the "query" field back to the index() Flask route. -->
<form action="{{ url_for('index') }}" method="POST">
    <h4>
        عبارت خود را برای جست و جو وارد کنید
    </h4>
    <label>
        <input type="text" name="query"/>
    </label><br>
    <input type="submit" name="submit" value="جست و جو" class="mt-1">
</form>
<!-- Results table: one row per document returned in `data`;
     row[1]=title, row[2]=content excerpt, row[3]=url. -->
<table id="results" class="table table-striped table-hover ms-1 me-1">
    <thead class="table-light">
    <tr>
        <th scope="row" style="font-family: 'B Titr',monospace">عنوان</th>
        <th scope="row" style="font-family: 'B Titr',monospace">گزیده</th>
        <th scope="row" style="font-family: 'B Titr',monospace">آدرس</th>
    </tr>
    </thead>
    <tbody>
    {% for row in data %}
    <tr class="text-light">
        <td class="ellipsis">{{ row[1] }}</td>
        <td class="content">{{ row[2] }}</td>
        <td class="ellipsis">{{ row[3] }}</td>
    </tr>
    {% endfor %}
    </tbody>
</table>
</body>
</html>
\ No newline at end of file
import json
import os
from parsivar import Normalizer, Tokenizer
def debugger(string):
    """Normalize a repr()'d token dict into JSON-compatible text.

    Applies the replacements in order: single quotes become double
    quotes, the triple-quote runs that creates are collapsed, and
    backslashes become spaces.
    """
    replacements = (
        ("\'", "\""),
        ("\"\"\"", "\"\""),
        ("\\", " "),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string
def token_maker(filename):
    """Tokenize one document's "content" field and write its positional
    token index (token -> [positions]) to "document tokens/<filename>".

    BUG FIX: the output path was a hard-coded literal, so every document
    overwrote the same file; it is now named after the source document
    (the original's ``filename = str(filename)`` line shows that intent).
    Also: both file handles are now closed, and the manual
    not-in-dict/else bookkeeping is replaced with setdefault/enumerate.
    """
    filepath = os.path.join("./docs", filename)
    with open(filepath, "r") as file:
        obj = json.load(file)
    # join() tolerates either a str or a char-list "content" value.
    text = ''.join(obj["content"])
    normal_txt = Normalizer().normalize(text)
    tokens = Tokenizer().tokenize_words(normal_txt)
    output = dict()
    for position, word in enumerate(tokens):
        output.setdefault(str(word), []).append(position)
    filename = str(filename)
    with open(f"document tokens/{filename}", "w", encoding="utf-8") as tk_fl:
        tk_fl.write(debugger(str(output)))
def get_file_test(files):
    """Tokenize the test document set, printing coarse progress
    (percentage assumes 23 files)."""
    for done, name in enumerate(files):
        token_maker(str(name))
        print(f"{int((done / 23) * 100)}% is tokenized!")
def get_file(files):
    """Tokenize the full document set, printing coarse progress
    (percentage assumes 12201 files)."""
    for done, name in enumerate(files):
        token_maker(str(name))
        print(f"{int((done / 12201) * 100)}% is tokenized!")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment