Farse news IR project

Commit e3f40329
Authored 2 years ago by Haj Rezvan
Optimized the functions.
Parent: bc700933
Showing 4 changed files with 143 additions and 45 deletions.
QP.py                   +116  -31
main.py                   +4   -4
templates/index.html      +2   -2
tokenizer.py             +21   -8
QP.py  (View file @ e3f40329)
import json
import os
import threading
...
...
@@ -71,7 +72,6 @@ def operations(inp):
    return flag


# Get dictionary of docIDs and return the title of most relevant.
def get_info(inp):
    result = list()
    if type(inp) == dict:
...
...
@@ -95,53 +95,138 @@ def get_info(inp):
    return out


def write_logs(string):
# Get dictionary of docIDs and return the title of most relevant.
def __write_logs(string):
    file = open(f"./logs/log.txt", "a", encoding="utf-8")
    file.write(string + "\n")
    file.close()
def __intersection(i: int, return_list: list, selected: list):
    if i == 0:
        for doc in range(0, len(selected)):
            return_list.append(selected[doc])
    else:
        counter = 0
        while counter < len(return_list):
            if not selected.__contains__(return_list[counter]):
                return_list.remove(return_list[counter])
                counter = counter - 1
            counter = counter + 1
    return return_list
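__intersection builds the running result set one query term at a time: the first term (i == 0) simply seeds the list, and every later term removes any document that its own candidate list does not contain. A minimal sketch of that behaviour on invented document IDs (not data from this repository):

    running = __intersection(0, [], ["3", "7", "12"])        # first term seeds the result
    running = __intersection(1, running, ["7", "12", "40"])  # later terms filter it
    print(running)  # -> ["7", "12"]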
def __double_quotation(i: int, query: list, dictionary: dict, finish: bool, doc_id: list):
    length = len(query[i])
    query[i] = query[i][1:length]  # length of phrase query.
    if dictionary.keys().__contains__(query[i]):
        while True:  # Find end of "
            selected = list()
            length = len(query[i + 1])
            if query[i][length - 1] == '"':
                query[i + 1] = query[i + 1][0:length - 1]
                finish = True
            # Find docID
            for aP in range(0, len(dictionary[query[i]])):
                # document ID
                doc = list(dictionary[query[i]].keys())[aP]
                # Number of document in dictionary.
                if dictionary[query[i + 1]].keys().__contains__(doc):
                    # Array of this word in the query.
                    presentPointer = dictionary[query[i]][doc]
                    # Array of next word in query.
                    nextPointer = list()
                    for bP in range(0, len(dictionary[query[i + 1]][doc])):
                        # Iterate from end to begin.
                        nextPointer.append(dictionary[query[i + 1]][doc].__getitem__(bP) - 1)
                    # Position of documents.
                    for p in dictionary[query[i]][doc]:
                        if nextPointer.__contains__(dictionary[query[i]][doc].__getitem__(p)):
                            selected.append(doc)
                            break
            # intersect of documents.
            doc_id = __intersection(i, doc_id, selected)
            i = i + 1
            if finish:
                return doc_id
    else:
        return list()
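The phrase check above rests on a small position trick: the next word's positions are shifted back by one (nextPointer) and compared with the current word's positions, so any overlap means the two words sit next to each other in that document. A toy illustration with invented positions:

    current_positions = [8, 21]                # positions of query[i] in one document
    next_positions = [9, 40]                   # positions of query[i + 1] in the same document
    shifted = [p - 1 for p in next_positions]  # [8, 39]
    print(any(p in shifted for p in current_positions))  # True: position 8 is immediately followed by 9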
def __not(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    length = len(query[i])
    query[i] = query[i][1:length]  # length of phrase query.
    if dictionary.keys().__contains__(query[i]):
        for term in range(0, file_numbers):
            if not dictionary.keys().__contains__(query[term]):
                selected.append(term)
    else:
        # Not in dictionary.
        for pointer in range(0, file_numbers):
            selected.append(pointer)
    doc_id = __intersection(i, doc_id, selected)
    return doc_id
def __file_number():
    global file_numbers
    os.chdir("./docs")
    file_numbers = len(os.listdir())
def __simple_check(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    if dictionary.keys().__contains__(query[i]):
        for term in range(file_numbers):
            if dictionary[query[i]].keys().__contains__(term):
                selected.append(term)
        doc_id = __intersection(i, doc_id, selected)
    else:
        doc_id = list()
    return doc_id


file_numbers = 0
def __checker(query: list, dictionary: dict):
    finish = False
    i = 0
    # For getting index of words in dictionary
    content = list()
    while i < len(query):
        if query[i][0] == '"':
            content = __double_quotation(i, query, dictionary, finish, content)
        elif query[i][0] == '!':
            content = __not(i, query, dictionary, content)
        else:
            content = __simple_check(i, query, dictionary, content)
        i = i + 1
    return content
def enter(it):
    t1 = threading.Thread(target=write_logs, args=(it,))
    t1 = threading.Thread(target=__write_logs, args=(it,))
    t1.start()
    t2 = threading.Thread(target=__file_number, args=())
    t2.start()
    spl = list(it.split(" "))
    file = open("./index/ii.json", "r", encoding="utf-8")
    index = json.load(file)
    dictionary = dict(index)
    rs = []
    for word in spl:
        if word in dictionary.keys():
            rs.append(word)
    rs = __checker(it, dictionary)
    ld = dict()
    for i in range(len(rs)):
        ld[rs[i]] = index.get(rs[i])
        print(ld[rs[i]])
    ld_copy = ld.copy()
    opt = list()
    if len(rs) > 1:
        flag = operations(spl)
        while len(flag) > 0:
            if "&" in flag:
                _and = spl.index("AND")
                nxt_word = spl[_and + 1]
                prv_word = spl[_and - 1]
                opt.extend(intersect(ld[nxt_word], ld[prv_word]))
                spl.pop(_and)
                ld.pop(nxt_word)
                ld.pop(prv_word)
                ld["opt"] = opt
                flag = operations(spl)
            elif "!" in flag:
                _not = spl.index("NOT")
                nxt_word = spl[_not + 1]
                prv_word = spl[_not - 1]
                opt = subtract(ld[prv_word], ld[nxt_word])
                print(opt)
                spl.pop(_not)
                flag = operations(spl)
    out_data = get_info(ld)
    t1.join()
    return out_data
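enter() assumes that ./index/ii.json stores the inverted index as term -> {docID: [positions]}, which is the shape __double_quotation and __simple_check index into. A hypothetical slice and call (terms, IDs and the query string are invented for illustration; intersect(), subtract() and get_info() are the helpers defined elsewhere in QP.py):

    # dictionary = {"economy": {"3": [0, 14], "7": [2]},
    #               "growth":  {"3": [15]}}
    titles = enter('"economy growth" AND inflation')  # returns whatever get_info() builds for the matching docs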
main.py  (View file @ e3f40329)
...
...
@@ -25,15 +25,15 @@ if __name__ == '__main__':
    thread = threading.Thread(target=stop_word())
    thread.run()
    split_document.file_open()  # Main Splitter for all of news.
    # split_document.file_open()  # Main Splitter for all of news.
    # split_document.file_open_test()  # Splitter for test dataset.
    split_document.file_open_test()  # Splitter for test dataset.
    files = __file_finder()
    os.chdir("..")
    tokenizer.get_file(files)  # Main tokenizer.
    # tokenizer.get_file(files)  # Main tokenizer.
    # tokenizer.get_file_test(files)  # Tokenizer in test dataset.
    tokenizer.get_file_test(files)  # Tokenizer in test dataset.
    index_maker.index()
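The commented and uncommented pairs above act as a manual switch between the full news corpus and the test dataset, and this commit appears to flip that switch towards the test variants. Written as a hypothetical flag (not code from the repository), the same choice would read:

    USE_TEST_DATASET = True
    if USE_TEST_DATASET:
        split_document.file_open_test()  # Splitter for test dataset.
        tokenizer.get_file_test(files)   # Tokenizer in test dataset.
    else:
        split_document.file_open()       # Main Splitter for all of news.
        tokenizer.get_file(files)        # Main tokenizer.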
templates/index.html  (View file @ e3f40329)
...
...
@@ -68,9 +68,9 @@
    {% for row in data %}
    <div class="box">
        <span style="font-size: 15px"> {{ row[3] }} </span>
        <span style="font-size: 15px"> {{ row[5] }} </span>
        <br/>
        <a href={{ row[3] }} style="font-family: 'B Titr'"> {{ row[1] }} </a>
        <a href={{ row[5] }} style="font-family: 'B Titr'"> {{ row[1] }} </a>
        <p style="font-family: 'B Nazanin'"> {{ row[2] }} </p>
...
...
tokenizer.py  (View file @ e3f40329)
...
...
@@ -8,7 +8,7 @@ global stop_words
def debugger(string):
    string = string.replace("\'", "\"")
    string = string.replace("\"\"\"", "\"\"")
    # string = string.replace("\"\"\"", "\"\"")
    string = string.replace("\\", " ")
    string = string.replace("\u200c", " ")
    return string
...
...
@@ -24,7 +24,8 @@ def normalize(tokens):
    return tokens


def token_maker(filename):
def token_maker(filename: str):
    counter = int(filename.replace(".json", ""))
    filepath = os.path.join("./docs", filename)
    file = open(filepath, "r")
    obj = json.load(file)
...
...
@@ -39,17 +40,29 @@ def token_maker(filename):
    normal_txt = normalizer.normalize(obj_cnt)
    tokens = pars_tokenizer.tokenize_words(normal_txt)
    output = dict()
    index = 0
    position = 0
    tokens = normalize(tokens)
    # word = tokens[position]
    for word in tokens:
        word = my_stemmer.convert_to_stem(word)
        if word not in output.keys():
            output[str(word)] = [index]
        if word.__contains__('&'):
            x = word.index('&')
            word = word[0:x]
        if not output.keys().__contains__(word):
            # Create Postings list
            output.update({str(word): {str(counter): [position]}})
        else:
            indexes = output[word]
            indexes.append(index)
            index = index + 1
            if output[word].keys().__contains__(position):
                output[word][str(counter)].append(position)
            else:
                output[word].update({str(counter): [position]})
        position = position + 1
    filename = str(filename)
    tk_fl = open(f"document tokens/{filename}", "w", encoding="utf-8")
...
...
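For one input document, token_maker() now builds a postings map keyed first by the stemmed word and then by the document number taken from the filename, with the word's positions inside that document as the value, before writing it to document tokens/<filename>. A hypothetical example of the resulting structure (words, document number and positions are invented):

    # e.g. after token_maker("12.json") the output dict could look like:
    output = {
        "economy": {"12": [4, 17]},
        "news": {"12": [0]},
    }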