Test for phase 2 of IR project.

d2249df1 · Haj Rezvan · 668cb7e5 · 668cb7e5 · d2249df1 · d2249df1
Commit d2249df1 authored May 23, 2022 by Haj Rezvan
7 changed files
--- a/debugger.py
+++ b/debugger.py
-import os
-
-os.chdir("./document tokens")
-
-
-def function():
-    for file in os.listdir():
-        file_name = os.path.join("./document tokens", file)
-        os.chdir("..")
-        rfl = open(file_name, "r+", encoding="utf-8")
-        string = rfl.read()
-        string = string.replace("\"\"\"", "\"\"")
-        string = string.replace('\xad', '')
-        string = string.replace('\u00ad', '')
-        string = string.replace('\N{SOFT HYPHEN}', '')
-        wfl = open(file_name, "w+", encoding="utf-8")
-        wfl.write(string)
-        print(f"File {file} debugged.")
-
-
-function()
--- a/index_maker.py
+++ b/index_maker.py
 import json

 import os
+import threading
+
+
+def logs(cnt, i):
+    """Write ``cnt`` to a log file named after ``i``.
+
+    :param cnt: text to record; ``index()`` passes the first line it read
+        from the token file.
+    :param i: identifier embedded in the log file name; ``index()`` passes
+        the token file name.
+    """
+    # The context manager closes the handle even when the write raises.
+    with open(f".{i} JSONDecodeError.log", 'w', encoding="utf-8") as file:
+        file.write(cnt)


 def index():
@@ -8,8 +15,14 @@ def index():
    invert_index = dict()
    os.chdir("./document tokens")
    for tk in os.listdir():
+        print(tk)
        token_file = open(f"./{tk}", "r", encoding="utf-8")
+        backup = token_file.readline()
+        try:
            tkn = json.load(token_file)
+        except (json.decoder.JSONDecodeError, Exception) as e:
+            thread = threading.Thread(target=logs, args=(backup, tk,))
+            thread.start()
        tkn = dict(tkn)
        tk = tk.replace(".json", "")
        tk = int(tk)

--- a/main.py
+++ b/main.py
 import os
+import threading

 import index_maker
 import split_document
 import tokenizer
-from tests import SE as t


 def __file_finder():
@@ -11,13 +11,24 @@ def __file_finder():
    return os.listdir()


+def stop_word():
+    """Load the stop-word list and store it in ``tokenizer.stop_words``.
+
+    Reads the file named ``stop words`` relative to the current working
+    directory; each line becomes one entry.
+    """
+    # The context manager guarantees the file is closed after reading.
+    with open("stop words", "r", encoding="utf8") as file:
+        # Only newline characters are removed; entries are otherwise verbatim.
+        tokenizer.stop_words = [line.replace("\n", "") for line in file]
+
+
 if __name__ == '__main__':
    print("بسم الله الرحمن الرحیم")
-    split_document.file_open()
-    t.split()
+    # Pass the callable itself: ``target=stop_word()`` would run the function
+    # right here and hand ``None`` to the thread as its target.
+    thread = threading.Thread(target=stop_word)
+    thread.start()
+    # Wait for the list to be loaded before the working directory changes
+    # below and before tokenization starts reading ``tokenizer.stop_words``.
+    thread.join()
+    # split_document.file_open()
+    split_document.file_open_test()
    files = __file_finder()
    os.chdir("..")
-    tokenizer.get_file(files)
-    t.token(files)
+    # tokenizer.get_file(files)
+    tokenizer.get_file_test(files)
    os.chdir("..")
    index_maker.index()
--- a/static/search.png
+++ b/static/search.png
--- a/stop words
+++ b/stop words
+و
+در
+به
+از
+كه
+مي
+اين
+است
+را
+با
+هاي
+براي
+آن
+يك
+شود
+شده
+خود
+ها
+كرد
+شد
+اي
+تا
+كند
+بر
+بود
+گفت
+نيز
+وي
+هم
+كنند
+دارد
+ما
+كرده
+يا
+اما
+بايد
+دو
+اند
+هر
+خواهد
+او
+مورد
+آنها
+باشد
+ديگر
+مردم
+نمي
+بين
+پيش
+پس
+اگر
+همه
+صورت
+يكي
+هستند
+بي
+من
+دهد
+هزار
+نيست
+استفاده
+داد
+داشته
+راه
+داشت
+چه
+همچنين
+كردند
+داده
+بوده
+دارند
+همين
+ميليون
+سوي
+شوند
+بيشتر
+بسيار
+روي
+گرفته
+هايي
+تواند
+اول
+نام
+هيچ
+چند
+جديد
+بيش
+شدن
+كردن
+كنيم
+نشان
+حتي
+اينكه
+ولی
+توسط
+چنين
+برخي
+نه
+ديروز
+دوم
+درباره
+بعد
+مختلف
+گيرد
+شما
+گفته
+آنان
+بار
+طور
+گرفت
+دهند
+گذاري
+بسياري
+طي
+بودند
+ميليارد
+بدون
+تمام
+كل
+تر
+براساس
+شدند
+ترين
+امروز
+باشند
+ندارد
+چون
+قابل
+گويد
+ديگري
+همان
+خواهند
+قبل
+آمده
+اكنون
+تحت
+طريق
+گيري
+جاي
+هنوز
+چرا
+البته
+كنيد
+سازي
+سوم
+كنم
+بلكه
+زير
+توانند
+ضمن
+فقط
+بودن
+حق
+آيد
+وقتي
+اش
+يابد
+نخستين
+مقابل
+خدمات
+امسال
+تاكنون
+مانند
+تازه
+آورد
+فكر
+آنچه
+نخست
+نشده
+شايد
+چهار
+جريان
+پنج
+ساخته
+زيرا
+نزديك
+برداري
+كسي
+ريزي
+رفت
+گردد
+مثل
+آمد
+ام
+بهترين
+دانست
+كمتر
+دادن
+تمامي
+جلوگيري
+بيشتري
+ايم
+ناشي
+چيزي
+آنكه
+بالا
+بنابراين
+ايشان
+بعضي
+دادند
+داشتند
+برخوردار
+نخواهد
+هنگام
+نبايد
+غير
+نبود
+ديده
+وگو
+داريم
+چگونه
+بندي
+خواست
+فوق
+ده
+نوعي
+هستيم
+ديگران
+همچنان
+سراسر
+ندارند
+گروهي
+سعي
+روزهاي
+آنجا
+يكديگر
+كردم
+بيست
+بروز
+سپس
+رفته
+آورده
+نمايد
+باشيم
+گويند
+زياد
+خويش
+همواره
+گذاشته
+شش
+نداشته
+شناسي
+خواهيم
+آباد
+داشتن
+نظير
+همچون
+باره
+نكرده
+شان
+سابق
+هفت
+دانند
+جايي
+بی
+جز
+زیرِ
+رویِ
+سریِ
+تویِ
+جلویِ
+پیشِ
+عقبِ
+بالایِ
+خارجِ
+وسطِ
+بیرونِ
+سویِ
+کنارِ
+پاعینِ
+نزدِ
+نزدیکِ
+دنبالِ
+حدودِ
+برابرِ
+طبقِ
+مانندِ
+ضدِّ
+هنگامِ
+برایِ
+مثلِ
+بارة
+اثرِ
+تولِ
+علّتِ
+سمتِ
+عنوانِ
+قصدِ
+روب
+جدا
+کی
+که
+چیست
+هست
+کجا
+کجاست
+کَی
+چطور
+کدام
+آیا
+مگر
+چندین
+یک
+چیزی
+دیگر
+کسی
+بعری
+هیچ
+چیز
+)
+(
+،
+.
+؟
+!
+@
+#
+%
+$
+^
+&
+*
+-
+
+=
+-
+جا
+کس
+هرگز
+یا
+تنها
+بلکه
+خیاه
+بله
+بلی
+آره
+آری
+مرسی
+البتّه
+لطفاً
+ّه
+انکه
+وقتیکه
+همین
+پیش
+مدّتی
+هنگامی
+مان
+تان
+"
\ No newline at end of file
--- a/templates/index.html
+++ b/templates/index.html
@@ -4,7 +4,8 @@
    <meta charset="UTF-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
-    <title>Document</title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
+    <title>موتور جست و جوی فارس</title>
    <style type="text/css">
        .box {
            width: 50%;
@@ -33,6 +34,7 @@
 <body>
 <div>
    <img src=" {{ url_for('static', filename='amirkabir.png') }}" style="width: 7%" alt=""/>
+    <form method="POST" action="{{ url_for('index') }}">
        <input
                style="
          border-radius: 1rem;
@@ -40,16 +42,23 @@
          height: 2rem;
          outline: none;
          border: 1px solid #8080804a;
-          margin-left: 2rem;
+          margin-left: 150px;
+          margin-top: 20px;
          position: absolute;
          top: 1rem;
          box-shadow: 0 2px 5px 1px rgba(64, 60, 67, 0.16);
          padding: 0 1rem;
        "
                type="text"
-            name=""
-            id=""
+                name="query"
+                id="query"
        />
+        <input type="image" style="
+          position: absolute;
+          margin-left: 910px;
+          margin-top: -108px;
+          " name="submit" src="{{ url_for('static', filename='search.png') }}"/>
+    </form>
    <img
            src="{{ url_for('static', filename='farsnewslogo.png') }}"
            style="width: 17%; position: absolute; right: 2rem; top: 1rem"
@@ -59,12 +68,15 @@

    {% for row in data %}
    <div class="box">
-        <span>{{ row[3] }}</span>
+        <span style="font-size: 15px">{{ row[3] }}</span>
        <br/>
-        <a href={{ row[3] }}>{{ row[1] }}</a>
-        <p>
+        <a href="{{ row[3] }}" style="font-family: 'B Titr'">{{ row[1] }}</a>
+        <p style="font-family: 'B Nazanin'">
            {{ row[2] }}
        </p>
+        <br/>
+        <br/>
+
    </div>
    {% endfor %}


--- a/tokenizer.py
+++ b/tokenizer.py
 import json
 import os

-from parsivar import Normalizer, Tokenizer
+from parsivar import Normalizer, Tokenizer, FindStems
+
+# Stop words consulted by normalize(); main.py overwrites this list at start-up.
+# A real default keeps normalize() usable when the module is imported alone
+# (a module-level ``global`` statement does not create the name).
+stop_words = []


 def debugger(string):
+    """Return ``string`` after a fixed series of literal substitutions.
+
+    NOTE(review): presumably cleans raw document text before it is parsed;
+    confirm against the caller.
+    """
+    # Single quotes become double quotes.
    string = string.replace("\'", "\"")
+    # Three consecutive double quotes collapse to two.
    string = string.replace("\"\"\"", "\"\"")
+    # Backslashes become spaces.
    string = string.replace("\\", " ")
+    # U+200C (zero-width non-joiner) becomes a plain space.
+    string = string.replace("\u200c", " ")
    return string


+def normalize(tokens):
+    """Remove every stop word from ``tokens`` and return the resulting list.
+
+    :param tokens: iterable of token strings; a ``list`` is filtered in place,
+        anything else is first copied into a new list.
+    :return: the list of tokens that are not in the module-level
+        ``stop_words``.
+    """
+    if not isinstance(tokens, list):
+        tokens = list(tokens)
+    # Build the lookup once; membership tests against a set are O(1).
+    blocked = set(stop_words)
+    # Filter in a single pass instead of removing items from the list while
+    # iterating it, which skipped the element following each removal.
+    tokens[:] = [word for word in tokens if word not in blocked]
+    return tokens
+
+
 def token_maker(filename):
    filepath = os.path.join("./docs", filename)
    file = open(filepath, "r")
@@ -22,12 +35,15 @@ def token_maker(filename):
        obj_cnt = ''.join(lst)
    normalizer = Normalizer()
    pars_tokenizer = Tokenizer()
+    my_stemmer = FindStems()
    normal_txt = normalizer.normalize(obj_cnt)
    tokens = pars_tokenizer.tokenize_words(normal_txt)
    output = dict()
    index = 0
+    tokens = normalize(tokens)

    for word in tokens:
+        word = my_stemmer.convert_to_stem(word)
        if word not in output.keys():
            output[str(word)] = [index]
        else:
@@ -42,17 +58,22 @@ def token_maker(filename):
    tk_fl.write(str_out)


-def get_file_test(files):
+def __starter(upper_bound, files):
+    """Tokenize every entry of ``files`` and print progress in whole percents.
+
+    :param upper_bound: file count that corresponds to 100%.
+    :param files: iterable of file names; each is passed to ``token_maker``.
+    """
    counter = 0
+    pre_percent = 0
+    print("0% is tokenized!")
    for f in files:
        token_maker(str(f))
-        print(f"{int((counter / 23) * 100)}% is tokenized!")
+        # Include the file that just finished, so the run ends at 100% when
+        # the number of files equals ``upper_bound``.
+        new_percent = int(((counter + 1) / upper_bound) * 100)
+        # Print only when the whole-percent value changes.
+        if new_percent != pre_percent:
+            print(f"{new_percent}% is tokenized!")
+            pre_percent = new_percent
    counter = counter + 1


+def get_file_test(files):
+    """Tokenize ``files`` through ``__starter`` with a fixed bound of 23.
+
+    NOTE(review): 23 is presumably the size of the test corpus; confirm.
+    """
+    __starter(23, files)
+
+
 def get_file(files):
+    """Tokenize ``files`` through ``__starter`` with a fixed bound of 12201.
+
+    NOTE(review): 12201 is presumably the size of the full corpus; confirm.
+    """
-    counter = 0
-    for f in files:
-        token_maker(str(f))
-        print(f"{int((counter / 12201) * 100)}% is tokenized!")
-        counter = counter + 1
+    __starter(12201, files)