Commit a2e879ae authored by Haj Rezvan's avatar Haj Rezvan

First commit.

parents
Pipeline #6125 canceled with stages
import json
def union_two(a, b):
    """Stable two-way merge of the sorted lists *a* and *b*.

    Returns a new sorted list containing every element of both inputs
    (duplicates preserved); neither input is modified.
    """
    merged = []
    i = j = 0
    len_a, len_b = len(a), len(b)
    while i < len_a and j < len_b:
        if a[i] <= b[j]:
            merged.append(a[i])
            i += 1
        else:
            merged.append(b[j])
            j += 1
    # One side is exhausted; the remainder of the other is already sorted.
    merged.extend(a[i:])
    merged.extend(b[j:])
    return merged
def normalize_list(lst):
    """Coerce *lst* into a plain list of posting lists.

    dict  -> list of its values
    list  -> returned unchanged
    other -> None (explicit, was an implicit fall-through before)

    Uses isinstance() instead of ``type(x) == T`` so subclasses work too.
    """
    if isinstance(lst, dict):
        return list(lst.values())
    if isinstance(lst, list):
        return lst
    return None
def union(inp):
    """Sorted multi-way merge ("union") of all posting lists in *inp*.

    *inp* is either a dict whose values are sorted lists, or a list of
    sorted lists. Returns a single sorted list (duplicates preserved),
    [] for empty input, or None for unsupported input types.

    BUG FIX: the original recursion sliced ``vl[0:n-2]`` (dropping one
    list per round) and appended partial results into a nested temp list,
    producing wrong, nested output for three or more lists, and returned
    the still-nested input unchanged for a single list.
    """
    import heapq  # local import: stdlib k-way merge of sorted iterables

    if isinstance(inp, dict):
        lists = list(inp.values())
    elif isinstance(inp, list):
        lists = list(inp)
    else:
        return None
    if not lists:
        return []
    return list(heapq.merge(*lists))
def intersect(lst1, lst2):
    """Return the elements common to both lists (order not guaranteed)."""
    common = set(lst1)
    common &= set(lst2)
    return list(common)
def operations(inp):
    """Detect the next boolean operator in the query token list *inp*.

    Returns ["&"] if "AND" is present, otherwise ["!"] if "NOT" is
    present, otherwise [].  AND deliberately takes priority over NOT,
    matching the original elif chain.
    """
    if "AND" in inp:
        return ["&"]
    if "NOT" in inp:
        return ["!"]
    return []
# Load the stored document record for every referenced docID.
def get_info(inp):
    """Fetch document records for the docIDs referenced by *inp*.

    *inp* is either {word: [docID, ...]} or a flat list of docIDs.
    Each docID is read from ``./docs/<docID>.json``; the record's values
    (doc_id, title, content, url) are returned as one list per document.

    BUG FIX: the original never closed the per-document file handles;
    ``with`` guarantees closure even if json.load raises.
    """
    if isinstance(inp, dict):
        # Flatten every word's posting list into one docID sequence.
        doc_ids = [doc_id for ids in inp.values() for doc_id in ids]
    elif isinstance(inp, list):
        doc_ids = inp
    else:
        doc_ids = []
    out = []
    for doc_id in doc_ids:
        with open(f"./docs/{doc_id}.json", "r", encoding="utf-8") as file:
            cnt = json.load(file)
        out.append(list(dict(cnt).values()))
    return out
def write_logs(string):
    """Append *string* to the query log at ./logs/log.txt.

    BUG FIX: the original left the handle to be closed manually; the
    ``with`` statement closes it even if the write raises.
    """
    with open("./logs/log.txt", "a", encoding="utf-8") as file:
        file.write(string)
def enter(it):
    # Evaluate the raw query string *it* against the inverted index and
    # return the matching documents' records (via get_info).
    write_logs(it)
    # Whitespace tokenization of the query; operator words (AND/NOT) stay in.
    spl = list(it.split(" "))
    # NOTE(review): this handle is never closed — consider a with-statement.
    file = open("./index/ii.json", "r", encoding="utf-8")
    index = json.load(file)
    dictionary = dict(index)
    # rs: query words that actually occur in the index.
    rs = []
    for word in spl:
        if word in dictionary.keys():
            rs.append(word)
    # ld: word -> posting list, restricted to the recognized words.
    ld = dict()
    for i in range(len(rs)):
        ld[rs[i]] = index.get(rs[i])
    opt = list()
    if len(rs) > 1:
        # Repeatedly consume one operator per pass until none remain.
        flag = operations(spl)
        while len(flag) > 0:
            if "&" in flag:
                # AND: intersect the posting lists of the neighboring words.
                _and = spl.index("AND")
                opt.append(intersect(ld[spl[_and + 1]], ld[spl[_and - 1]]))
                spl.pop(_and)
                flag = operations(spl)
            elif "!" in flag:
                # NOT: the operator is only removed here — negation is not
                # actually applied to the result set (NOTE(review): looks
                # half-implemented; confirm intended semantics).
                _not = spl.index("NOT")
                spl.pop(_not)
                flag = operations(spl)
        pass
    # NOTE(review): `opt` (the AND results) is computed but never used;
    # all recognized words' postings are returned regardless.
    out_data = get_info(ld)
    return out_data
# Flask project
from flask import Flask, render_template, request
import QP
app = Flask(__name__)
def query(inp):
    # Thin wrapper: delegate the raw query string to the QP (query
    # processor) module and return its result rows.
    return QP.enter(inp)
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the search page.

    GET  -> empty page with the query form.
    POST -> run the submitted query and render the results table.
    """
    # Guard clause: plain GET renders the bare form.
    if request.method != 'POST':
        return render_template("index.html")
    results = query(request.form['query'])
    return render_template("index.html", data=results)
import os

# Module-level side effect: the script starts inside the token directory
# so function() below can listdir() it directly.
os.chdir("./document tokens")
def function():
    """Scrub artifacts (quote runs, soft hyphens) from every tokenized
    document file under "./document tokens".

    Assumes the cwd is "./document tokens" on entry (set at import time)
    and leaves it at the project root on exit.

    BUG FIX: the original called os.chdir("..") inside the loop, climbing
    one directory level on every iteration, so every file after the first
    was opened relative to the wrong directory. It also never closed its
    read/write handles.
    """
    names = os.listdir()  # cwd is still the token directory here
    os.chdir("..")        # hop up to the project root exactly once
    for file in names:
        file_name = os.path.join("./document tokens", file)
        with open(file_name, "r", encoding="utf-8") as rfl:
            string = rfl.read()
        string = string.replace("\"\"\"", "\"\"")
        # '\xad', '\u00ad' and '\N{SOFT HYPHEN}' are all the same character;
        # a single replace covers the three original calls.
        string = string.replace('\xad', '')
        with open(file_name, "w", encoding="utf-8") as wfl:
            wfl.write(string)
        print(f"File {file} debugged.")


function()
import json
import os
def index():
    """Build the inverted index {token: sorted [docID, ...]} from every
    per-document token file and dump it to ./index/ii.json.

    Side effect (kept from the original): changes the process cwd to
    "./document tokens".

    BUG FIX: the output handle was never closed; postings were re-sorted
    on every single insertion (now sorted once at the end — same result).
    """
    invert_index = dict()
    # Open the output BEFORE chdir: its path is relative to the project root.
    with open("./index/ii.json", "w", encoding="utf-8") as index_file:
        os.chdir("./document tokens")
        for tk in os.listdir():
            with open(tk, "r", encoding="utf-8") as token_file:
                tkn = dict(json.load(token_file))
            # File names are "<docID>.json".
            doc_id = int(tk.replace(".json", ""))
            for key in tkn:
                invert_index.setdefault(str(key), []).append(doc_id)
        # listdir order is arbitrary, so sort each posting list once.
        for postings in invert_index.values():
            postings.sort()
        json.dump(invert_index, index_file)
    print("Invert index made!")
import os
import index_maker
import split_document
import tokenizer
from tests import SE as t
def __file_finder():
    # List all split document files in ./docs.
    # NOTE(review): leaves the process cwd changed to ./docs; the caller
    # in __main__ compensates with os.chdir("..") afterwards.
    os.chdir("./docs")
    return os.listdir()
if __name__ == '__main__':
    # Pipeline entry point: split the raw corpus into per-document JSON
    # files, tokenize them, then build the inverted index. The t.* calls
    # are test hooks run after each stage.
    print("بسم الله الرحمن الرحیم")
    split_document.file_open()
    t.split()
    files = __file_finder()  # cwd is ./docs after this call
    os.chdir("..")
    tokenizer.get_file(files)
    t.token(files)
    os.chdir("..")
    index_maker.index()
import json
from tqdm import tqdm
def file_open_test():
    """Split the small test corpus into per-document files.

    BUG FIX: the file handle was never closed; ``with`` closes it once
    __processor has consumed it.
    """
    with open("data/test_data.json") as file:
        __processor(file)
def file_open():
    """Split the full 12k news corpus into per-document files.

    BUG FIX: the file handle was never closed; ``with`` closes it once
    __processor has consumed it.
    """
    with open("data/IR_data_news_12k.json") as file:
        __processor(file)
def __writer(doc_id, title, content, url):
    """Persist one document record as docs/<doc_id>.json.

    BUG FIX: the file handle was never closed; ``with`` guarantees the
    JSON is flushed and the handle released.
    """
    record = {
        "doc_id": doc_id,
        "title": title,
        "content": content,
        "url": url,
    }
    with open(f"docs/{doc_id}.json", "w") as file:
        json.dump(record, file)
def __object_clear(lst):
    """Empty *lst* in place (mutates the caller's list)."""
    del lst[:]
def __processor(file):
    """Split the raw corpus JSON in *file* into per-document files.

    The corpus is a JSON object keyed by stringified integers 0..13999
    (with gaps); each entry carries "title", "content" and "url" fields.
    Missing ids are skipped, matching the original best-effort behavior.

    Improvements: the original rebuilt each field character-by-character
    through a shared scratch list (''.join of the characters yields the
    same string directly), and swallowed every exception with a bare
    ``except Exception: pass`` — now only the expected KeyError for
    missing entries/fields is caught.
    """
    data = json.load(file)
    for i in tqdm(range(14000), desc="Splatted: "):
        try:
            entry = data[f"{i}"]
            # join() tolerates either a str or a char-list field value.
            title = ''.join(entry["title"])
            content = ''.join(entry["content"])
            url = ''.join(entry["url"])
        except KeyError:
            # Gap in the corpus ids — skip silently, as before.
            continue
        __writer(i, title, content, url)
/* Styles for the search-result table cells (see index.html). */

/* Body/content cell: clamp to a small centered box, hide overflow
   behind an ellipsis, and force long words to wrap. */
.content {
    text-align: center;
    max-width: 60px;
    height: 18px;
    overflow: hidden;
    text-overflow: ellipsis;
    font-family: 'B Koodak', monospace;
    color: black;
    word-break: break-all;
    word-wrap: break-word;
}

/* Title/URL cells: narrower box, same truncation-with-ellipsis look. */
.ellipsis {
    text-align: center;
    max-width: 40px;
    text-overflow: ellipsis;
    overflow: hidden;
    font-family: 'B Koodak', monospace;
    color: black;
}
\ No newline at end of file
<!DOCTYPE html>
<!-- Search front page (Farsi search engine): a query form plus a results
     table rendered by Flask/Jinja2. POSTs back to the index() route. -->
<html lang="en" xmlns="http://www.w3.org/1999/html">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
          integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
    <link href="../static/file.css" rel="stylesheet">
    <title>موتور جست و جوی فارس</title>
</head>
<body>
<h1>
    موتور جست و جوی فارس
</h1>
<!-- Query form: posts the "query" field back to the index() Flask route. -->
<form action="{{ url_for('index') }}" method="POST">
    <h4>
        عبارت خود را برای جست و جو وارد کنید
    </h4>
    <label>
        <input type="text" name="query"/>
    </label><br>
    <input type="submit" name="submit" value="جست و جو" class="mt-1">
</form>
<!-- Results table: one row per document returned in `data`;
     row[1]=title, row[2]=content excerpt, row[3]=url. -->
<table id="results" class="table table-striped table-hover ms-1 me-1">
    <thead class="table-light">
    <tr>
        <th scope="row" style="font-family: 'B Titr',monospace">عنوان</th>
        <th scope="row" style="font-family: 'B Titr',monospace">گزیده</th>
        <th scope="row" style="font-family: 'B Titr',monospace">آدرس</th>
    </tr>
    </thead>
    <tbody>
    {% for row in data %}
    <tr class="text-light">
        <td class="ellipsis">{{ row[1] }}</td>
        <td class="content">{{ row[2] }}</td>
        <td class="ellipsis">{{ row[3] }}</td>
    </tr>
    {% endfor %}
    </tbody>
</table>
</body>
</html>
\ No newline at end of file
import json
import os
from parsivar import Normalizer, Tokenizer
def debugger(string):
    """Normalize a repr()'d token dict into JSON-compatible text.

    Applies the replacements in order: single quotes become double
    quotes, the triple-quote runs that creates are collapsed, and
    backslashes become spaces.
    """
    replacements = (
        ("\'", "\""),
        ("\"\"\"", "\"\""),
        ("\\", " "),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string
def token_maker(filename):
    """Tokenize one document's "content" field and write its positional
    token index (token -> [positions]) to "document tokens/<filename>".

    BUG FIX: the output path was a hard-coded literal, so every document
    overwrote the same file; it is now named after the source document
    (the original's ``filename = str(filename)`` line shows that intent).
    Also: both file handles are now closed, and the manual
    not-in-dict/else bookkeeping is replaced with setdefault/enumerate.
    """
    filepath = os.path.join("./docs", filename)
    with open(filepath, "r") as file:
        obj = json.load(file)
    # join() tolerates either a str or a char-list "content" value.
    text = ''.join(obj["content"])
    normal_txt = Normalizer().normalize(text)
    tokens = Tokenizer().tokenize_words(normal_txt)
    output = dict()
    for position, word in enumerate(tokens):
        output.setdefault(str(word), []).append(position)
    filename = str(filename)
    with open(f"document tokens/{filename}", "w", encoding="utf-8") as tk_fl:
        tk_fl.write(debugger(str(output)))
def get_file_test(files):
    """Tokenize the test document set, printing coarse progress
    (percentage assumes 23 files)."""
    for done, name in enumerate(files):
        token_maker(str(name))
        print(f"{int((done / 23) * 100)}% is tokenized!")
def get_file(files):
    """Tokenize the full document set, printing coarse progress
    (percentage assumes 12201 files)."""
    for done, name in enumerate(files):
        token_maker(str(name))
        print(f"{int((done / 12201) * 100)}% is tokenized!")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment