Commit e3f40329 authored by Haj Rezvan

Optimized the functions.

parent bc700933
import json
import os
import threading
@@ -71,7 +72,6 @@ def operations(inp):
return flag
# Take a dictionary of docIDs and return the title of the most relevant one.
def get_info(inp):
result = list()
if type(inp) == dict:
@@ -95,53 +95,138 @@ def get_info(inp):
return out
def write_logs(string):
# Append each query string to the log file.
def __write_logs(string):
file = open(f"./logs/log.txt", "a", encoding="utf-8")
file.write(string + "\n")
file.close()
def __intersection(i: int, return_list: list, selected: list):
    # Incrementally intersect the docIDs accumulated so far with the docIDs
    # selected for the current query term.
    if i == 0:
        # First term: seed the result with everything selected for it.
        return_list.extend(selected)
    else:
        # Later terms: keep only docIDs that were also selected now.
        return_list[:] = [doc for doc in return_list if doc in selected]
    return return_list
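# Worked example (illustrative values only): for the two-term query "tehran news",
#   i == 0, selected == [0, 3, 5]  ->  return_list becomes [0, 3, 5]
#   i == 1, selected == [3, 5, 9]  ->  return_list is filtered to [3, 5]
# so only documents containing every term survive.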
def __double_quotation(i: int, query: list, dictionary: dict, finish: bool, doc_id: list):
    # Strip the opening double quote from the first word of the phrase.
    query[i] = query[i][1:]
    if query[i] not in dictionary:
        return list()
    while not finish:  # Walk the phrase word by word until the closing " is found.
        selected = list()
        length = len(query[i + 1])
        if query[i + 1][length - 1] == '"':
            # Strip the closing double quote; this is the last word of the phrase.
            query[i + 1] = query[i + 1][0:length - 1]
            finish = True
        if query[i + 1] not in dictionary:
            return list()
        # Select every document in which the next word occurs exactly
        # one position after the current word.
        for doc in dictionary[query[i]].keys():  # Document IDs in the postings list.
            if doc in dictionary[query[i + 1]]:
                # Positions of the current word in this document.
                present_pointer = dictionary[query[i]][doc]
                # Positions of the next word, shifted back by one.
                next_pointer = [pos - 1 for pos in dictionary[query[i + 1]][doc]]
                for p in present_pointer:
                    if p in next_pointer:
                        selected.append(doc)
                        break
        # Intersect with the documents matched so far.
        doc_id = __intersection(i, doc_id, selected)
        i = i + 1
    return doc_id
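# Phrase-query sketch (illustrative postings): for the query '"tehran news"'
# the quotes are stripped and a document is selected only when "news" occurs
# at position p + 1 for some position p of "tehran", e.g.
#   dictionary["tehran"]["12"] == [4, 30] and dictionary["news"]["12"] == [5]
# matches document "12" because 5 - 1 == 4.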
def __not(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    # Strip the leading '!' from the term.
    query[i] = query[i][1:]
    if query[i] in dictionary:
        # Select every document whose postings do not contain the term
        # (document IDs are stored as strings by the indexer).
        for term in range(0, file_numbers):
            if str(term) not in dictionary[query[i]]:
                selected.append(term)
    else:  # Term not in the dictionary: every document matches.
        for pointer in range(0, file_numbers):
            selected.append(pointer)
    doc_id = __intersection(i, doc_id, selected)
    return doc_id
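# Example (illustrative): with file_numbers == 4 and the query term "!tehran",
# every document whose postings do not list "tehran" is selected, e.g. if only
# documents "1" and "3" contain it, selected becomes [0, 2] before the intersection.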
def __file_number():
    # Count the documents without changing the working directory,
    # so the relative paths used elsewhere keep working.
    global file_numbers
    file_numbers = len(os.listdir("./docs"))
def __simple_check(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    if query[i] in dictionary:
        # Collect every document whose postings list contains the term
        # (document IDs are stored as strings by the indexer).
        for term in range(file_numbers):
            if str(term) in dictionary[query[i]]:
                selected.append(term)
        doc_id = __intersection(i, doc_id, selected)
    else:
        doc_id = list()
    return doc_id
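# Example (illustrative): for the term "news" with postings {"0": [2], "3": [7, 9]}
# and file_numbers == 4, selected becomes [0, 3]; anything collected for earlier
# terms is then narrowed down through __intersection.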
file_numbers = 0
def __checker(query: list, dictionary: dict):
    finish = False
    i = 0  # Index of the current word in the query.
    content = list()
    while i < len(query):
        if query[i][0] == '"':  # Phrase query.
            content = __double_quotation(i, query, dictionary, finish, content)
        elif query[i][0] == '!':  # NOT operator.
            content = __not(i, query, dictionary, content)
        else:  # Plain term.
            content = __simple_check(i, query, dictionary, content)
        i = i + 1
    return content
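# Dispatch example (illustrative query): in ['"tehran', 'news"', '!sport', 'football'],
# the leading '"' routes the phrase to __double_quotation, the leading '!' routes
# 'sport' to __not, and plain terms such as 'football' go through __simple_check;
# each handler narrows the shared docID list.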
def enter(it):
t1 = threading.Thread(target=write_logs, args=(it,))
t1 = threading.Thread(target=__write_logs, args=(it,))
t1.start()
t2 = threading.Thread(target=__file_number, args=())
t2.start()
spl = list(it.split(" "))
file = open("./index/ii.json", "r", encoding="utf-8")
index = json.load(file)
dictionary = dict(index)
rs = []
for word in spl:
if word in dictionary.keys():
rs.append(word)
    t2.join()  # Make sure file_numbers has been counted before running the query.
    rs = __checker(spl, dictionary)
ld = dict()
for i in range(len(rs)):
ld[rs[i]] = index.get(rs[i])
print(ld[rs[i]])
ld_copy = ld.copy()
opt = list()
if len(rs) > 1:
flag = operations(spl)
while len(flag) > 0:
if "&" in flag:
_and = spl.index("AND")
nxt_word = spl[_and + 1]
prv_word = spl[_and - 1]
opt.extend(intersect(ld[nxt_word], ld[prv_word]))
spl.pop(_and)
ld.pop(nxt_word)
ld.pop(prv_word)
ld["opt"] = opt
flag = operations(spl)
elif "!" in flag:
_not = spl.index("NOT")
nxt_word = spl[_not + 1]
prv_word = spl[_not - 1]
opt = subtract(ld[prv_word], ld[nxt_word])
print(opt)
spl.pop(_not)
flag = operations(spl)
out_data = get_info(ld)
t1.join()
return out_data
@@ -25,15 +25,15 @@ if __name__ == '__main__':
    thread = threading.Thread(target=stop_word)
    thread.start()
split_document.file_open() # Main Splitter for all of news.
# split_document.file_open() # Main Splitter for all of news.
# split_document.file_open_test() # Splitter for test dataset.
split_document.file_open_test() # Splitter for test dataset.
files = __file_finder()
os.chdir("..")
tokenizer.get_file(files) # Main tokenizer.
# tokenizer.get_file(files) # Main tokenizer.
# tokenizer.get_file_test(files) # Tokenizer in test dataset.
tokenizer.get_file_test(files) # Tokenizer in test dataset.
index_maker.index()
@@ -68,9 +68,9 @@
{% for row in data %}
<div class="box">
<span style="font-size: 15px">{{ row[3] }}</span>
<span style="font-size: 15px">{{ row[5] }}</span>
<br/>
<a href={{ row[3] }} style="font-family: 'B Titr'">{{ row[1] }}</a>
<a href="{{ row[5] }}" style="font-family: 'B Titr'">{{ row[1] }}</a>
<p style="font-family: 'B Nazanin'">
{{ row[2] }}
</p>
......
@@ -8,7 +8,7 @@ global stop_words
def debugger(string):
string = string.replace("\'", "\"")
string = string.replace("\"\"\"", "\"\"")
# string = string.replace("\"\"\"", "\"\"")
string = string.replace("\\", " ")
string = string.replace("\u200c", " ")
return string
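# Example: debugger("خبر\u200cها") returns "خبر ها"; the zero-width non-joiner
# and backslashes become spaces and single quotes become double quotes,
# presumably so the raw text can be parsed as JSON without escaping issues.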
@@ -24,7 +24,8 @@ def normalize(tokens):
return tokens
def token_maker(filename):
def token_maker(filename: str):
counter = int(filename.replace(".json", ""))
filepath = os.path.join("./docs", filename)
file = open(filepath, "r", encoding="utf-8")
obj = json.load(file)
@@ -39,17 +40,29 @@ def token_maker(filename):
normal_txt = normalizer.normalize(obj_cnt)
tokens = pars_tokenizer.tokenize_words(normal_txt)
output = dict()
index = 0
position = 0
tokens = normalize(tokens)
# word = tokens[position]
for word in tokens:
word = my_stemmer.convert_to_stem(word)
if word not in output.keys():
output[str(word)] = [index]
        if '&' in word:
            # Keep only the part of the token before the '&'.
            word = word[:word.index('&')]
        if word not in output:  # Create the postings list for a new term.
            output[str(word)] = {str(counter): [position]}
else:
indexes = output[word]
indexes.append(index)
index = index + 1
        if str(counter) in output[word]:  # This document already has postings for the term.
            output[word][str(counter)].append(position)
        else:
            output[word][str(counter)] = [position]
position = position + 1
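    # Resulting structure (illustrative): if the stemmed token "خبر" appears at
    # positions 4 and 17 of document 3.json (counter == 3), the postings entry is
    #   {"خبر": {"3": [4, 17]}}
    # i.e. term -> document ID -> list of positions inside that document.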
filename = str(filename)
tk_fl = open(f"document tokens/{filename}", "w", encoding="utf-8")
......