Commit 51d4a6fd authored by Haj Rezvan

Optimized the functions.

parent 88bb2e3d
import json import json
import os import os
import threading
def logs(cnt, i): def logs(cnt, i):
...@@ -14,15 +13,15 @@ def index(): ...@@ -14,15 +13,15 @@ def index():
index_file = open(f"./index/ii.json", "w", encoding="utf-8") index_file = open(f"./index/ii.json", "w", encoding="utf-8")
invert_index = dict() invert_index = dict()
os.chdir("./document tokens") os.chdir("./document tokens")
for tk in os.listdir(): files = os.listdir()
print(tk) counter = 0
pre_percent = 0
print("0% is Index made!")
for tk in files:
token_file = open(f"./{tk}", "r", encoding="utf-8") token_file = open(f"./{tk}", "r", encoding="utf-8")
backup = token_file.readline()
try: try:
tkn = json.load(token_file) tkn = json.load(token_file)
except (json.decoder.JSONDecodeError, Exception) as e: # backup = token_file.readline()
thread = threading.Thread(target=logs, args=(backup, tk,))
thread.start()
tkn = dict(tkn) tkn = dict(tkn)
tk = tk.replace(".json", "") tk = tk.replace(".json", "")
tk = int(tk) tk = int(tk)
...@@ -36,8 +35,13 @@ def index(): ...@@ -36,8 +35,13 @@ def index():
indexes.sort() indexes.sort()
token_file.close() token_file.close()
# str_out = str(invert_index) new_percent = int((counter / len(files)) * 100)
# str_out = str_out.replace("\'", "\"") if new_percent != pre_percent:
# str_out = dict(str_out) print(f"{new_percent}% is Index made!")
pre_percent = new_percent
counter = counter + 1
except Exception as e:
print(f"Exception in file {tk}\n{e.args}\n")
json.dump(invert_index, index_file) json.dump(invert_index, index_file)
print("Invert index made!") print("Invert index made!")
...@@ -38,17 +38,20 @@ def __retrieval(data, i, tag): ...@@ -38,17 +38,20 @@ def __retrieval(data, i, tag):
return obj_cnt return obj_cnt
except Exception as ignore: except Exception as ignore:
print(f"We have a Exception!! {ignore.with_traceback}") print(f"We have a Exception!! {ignore.with_traceback}")
pass
def __processor(file): def __processor(file):
data = json.load(file) data = json.load(file)
length = len(data)
for i in tqdm(range(14000), desc="Splatted: "): doc_ids = list(data.keys())
title = __retrieval(data, i, "title") for i in tqdm(range(length), desc="Splatted: "):
content = __retrieval(data, i, "content") doc_id = doc_ids[i]
url = __retrieval(data, i, "url") title = __retrieval(data, doc_id, "title")
tags = __retrieval(data, i, "tags") content = __retrieval(data, doc_id, "content")
date = __retrieval(data, i, "date") url = __retrieval(data, doc_id, "url")
category = __retrieval(data, i, "category") tags = __retrieval(data, doc_id, "tags")
date = __retrieval(data, doc_id, "date")
__writer(i, title, content, url, tags, date, category) category = __retrieval(data, doc_id, "category")
__writer(doc_id, title, content, url, tags, date, category)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment