Added qcond (Query Condenser) v0.1.0 library - https://github.com/fuzeman/QueryCondenser
This commit is contained in:
42
libs/qcond/__init__.py
Normal file
42
libs/qcond/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from qcond.transformers.merge import MergeTransformer
|
||||
from qcond.transformers.slice import SliceTransformer
|
||||
from qcond.transformers.strip_common import StripCommonTransformer
|
||||
|
||||
|
||||
# Version components are kept as strings so they can be joined directly.
__version_info__ = ('0', '1', '0')
# Branch name appended to the version after a dash; an empty value adds nothing.
__version_branch__ = 'master'

# Dotted version number, followed by "-<branch>" when a branch is set.
__version__ = '.'.join(__version_info__)
if __version_branch__:
    __version__ += '-' + __version_branch__
|
||||
|
||||
|
||||
class QueryCondenser(object):
    """Condense a list of titles by running it through a chain of transformers."""

    def __init__(self):
        # Stages run in list order; each one receives the previous stage's output.
        merge_stage = MergeTransformer()
        slice_stage = SliceTransformer()
        strip_stage = StripCommonTransformer()

        self.transformers = [merge_stage, slice_stage, strip_stage]

    def distinct(self, titles):
        """Return `titles` after every transformer's ``run`` was applied in order."""
        condensed = titles

        for stage in self.transformers:
            condensed = stage.run(condensed)

        return condensed
|
||||
23
libs/qcond/compat.py
Normal file
23
libs/qcond/compat.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
# True when the interpreter's major version is 3.
PY3 = sys.version_info[0] == 3

# Expose a lazy integer range under one name: Python 3 only has `range`,
# while other versions keep their builtin `xrange`.
xrange = range if PY3 else xrange
|
||||
84
libs/qcond/helpers.py
Normal file
84
libs/qcond/helpers.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from difflib import SequenceMatcher
|
||||
import re
|
||||
import sys
|
||||
from logr import Logr
|
||||
from qcond.compat import xrange
|
||||
|
||||
|
||||
# True when the interpreter's major version is 3.
PY3 = sys.version_info[0] == 3
|
||||
|
||||
|
||||
def simplify(s):
    """Return `s` lower-cased with apostrophes inside words removed.

    An apostrophe is dropped only when it has a word character on both
    sides, so "Don't" becomes "dont" while leading/trailing quotes stay.

    :param s: title string to simplify
    :return: the simplified string
    """
    # Lookarounds leave the neighbouring letters unconsumed, so consecutive
    # apostrophes such as in "rock'n'roll" are all removed; a capturing
    # pattern would swallow the "n" and miss the second apostrophe.
    return re.sub(r"(?<=\w)'(?=\w)", "", s.lower())
|
||||
|
||||
|
||||
def strip(s):
    """Return `s` without its leading and trailing non-word characters.

    When the pattern does not match at all, `s` is returned unchanged.
    """
    matched = re.match(r"^(\W*)(.*?)(\W*)$", s)

    if matched is None:
        return s

    # Group 2 is the core text between the non-word prefix and suffix.
    return matched.group(2)
|
||||
|
||||
|
||||
def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    """Build a ``difflib.SequenceMatcher`` comparing `a` against `b`.

    :param swap_longest: when true, the longer string is always used as the
        matcher's first sequence
    :param case_sensitive: when false, both strings are upper-cased first
    """
    first_seq, second_seq = a, b

    # Ensure the longest string comes first
    if swap_longest and len(second_seq) > len(first_seq):
        first_seq, second_seq = second_seq, first_seq

    if case_sensitive:
        return SequenceMatcher(None, first_seq, second_seq)

    return SequenceMatcher(None, first_seq.upper(), second_seq.upper())
|
||||
|
||||
|
||||
def first(function_or_none, sequence):
    """Return the first item of `sequence` accepted by `function_or_none`.

    Mirrors the semantics of the builtin ``filter``: when `function_or_none`
    is None, the first truthy item is returned.

    :param function_or_none: predicate called with each item, or None
    :param sequence: any iterable
    :return: the first matching item, or None when nothing matches
    """
    predicate = bool if function_or_none is None else function_or_none

    # A plain loop stops at the first hit on every Python version, instead of
    # materialising the whole filtered list first.
    for item in sequence:
        if predicate(item):
            return item

    return None
|
||||
|
||||
def sorted_append(sequence, item, func):
    """Insert `item` into `sequence` in place, before the first match of `func`.

    `func` is called with each existing element in order; `item` is inserted
    directly before the first element for which it returns a true value. When
    no element matches (including an empty `sequence`), `item` is appended.

    :param sequence: mutable list to insert into
    :param item: value to insert
    :param func: predicate taking an existing element
    :return: None
    """
    for index, existing in enumerate(sequence):
        if func(existing):
            # Returning right after the insert keeps the iteration safe.
            sequence.insert(index, item)
            return

    sequence.append(item)
|
||||
|
||||
def itemsMatch(L1, L2):
    """Return True when `L1` and `L2` hold the same items, ignoring order."""
    # Different sizes can never match; checked first to skip the sorting.
    if len(L1) != len(L2):
        return False

    return sorted(L1) == sorted(L2)
|
||||
|
||||
def distinct(sequence):
    """Return a list of the items of `sequence` with later duplicates removed.

    The order of first appearance is kept. Items are compared by equality
    against the list built so far.
    """
    unique = []

    for candidate in sequence:
        if candidate in unique:
            continue

        unique.append(candidate)

    return unique
|
||||
0
libs/qcond/transformers/__init__.py
Normal file
0
libs/qcond/transformers/__init__.py
Normal file
21
libs/qcond/transformers/base.py
Normal file
21
libs/qcond/transformers/base.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
class Transformer(object):
    """Base class for title transformers; subclasses override ``run``."""

    def __init__(self):
        """Set up nothing; exists so subclasses can chain up to it."""

    def run(self, titles):
        """Transform `titles`; always raises NotImplementedError here."""
        raise NotImplementedError()
|
||||
238
libs/qcond/transformers/merge.py
Normal file
238
libs/qcond/transformers/merge.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from operator import itemgetter
|
||||
from logr import Logr
|
||||
from qcond.helpers import simplify, strip, first, sorted_append, distinct
|
||||
from qcond.transformers.base import Transformer
|
||||
from qcond.compat import xrange
|
||||
|
||||
|
||||
class MergeTransformer(Transformer):
    """Merge titles by building a tree of their words and collapsing siblings."""

    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        """Simplify, de-duplicate and merge `titles`.

        :param titles: iterable of title strings
        :return: list of merged title strings, ordered by descending
            (score, original title)
        """
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            if value in scores:
                # Several tails collapsed into the same text: add up their scores.
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        sorted_results = sorted(
            results,
            key=lambda item: (scores[item[0]], item[1]),
            reverse=True
        )

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        """Build the word tree for `titles`.

        Each title is split on spaces and every word becomes (or reuses) a
        DNode below the node of the preceding word.

        :return: tuple ``(root, tails)`` where `root` lists the first-word
            nodes and `tails` holds the last-word node of every title
        """
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx, raw_word in enumerate(words):
                word = strip(raw_word)
                # Words left in the title, counting the current one
                remaining = len(words) - wx

                if cur is None:
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=remaining, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, remaining)
                        cur = DNode(word, parent, num_children=remaining)
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        """Merge the children of every root node in place and return `root`."""
        for node in root:
            Logr.debug(node)
            node.right = self._merge(node.right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        """Return a flat list of `value` (a node or list of nodes) and all descendants."""
        if not isinstance(value, list):
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)
            nodes.extend(self.get_nodes_right(node.right))

        return nodes

    def destroy_nodes_right(self, value):
        """Clear the value of, and mark dead, `value` and all of its descendants."""
        for node in self.get_nodes_right(value):
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        """Collapse the sibling list `nodes` into its first node, recursively.

        :param nodes: list of sibling DNodes
        :param depth: recursion depth, used only to indent debug output
        :return: list of the siblings that are still alive
        """
        Logr.debug(str('\t' * depth) + str(nodes))

        # A node without children (e.g. a single-word title) has an empty
        # sibling list; there is nothing to merge and no first node to take.
        if not nodes:
            return nodes

        top = nodes[0]

        # Merge extra results into top
        for extra in nodes[1:]:
            top.value = None
            top.weight += extra.weight
            self.destroy_nodes_right(top.right)

            if len(extra.right):
                top.join_right(extra.right)

                Logr.debug("= %s joined %s", extra, top)

            extra.dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes
|
||||
|
||||
|
||||
def print_tree(node, depth = 0):
    """Debug-log `node` and its subtree, one tab of indentation per level.

    For a node without children, the text of its ``full_value()`` is logged
    as well.
    """
    indent = str('\t' * depth)
    Logr.debug(indent + str(node))

    children = node.right

    if not len(children):
        Logr.debug(node.full_value()[1])
        return

    for child in children:
        print_tree(child, depth + 1)
|
||||
|
||||
|
||||
def find_node(node_list, value):
    """Return the first node in `node_list` whose ``value`` equals `value`, else None."""
    matches = (candidate for candidate in node_list if candidate.value == value)

    return next(matches, None)
|
||||
|
||||
|
||||
class DNode(object):
    """Node of a word tree: one word plus links to its parent and children.

    :param value: the word held by this node
    :param parent: parent DNode, or None for a root node
    :param right: list of child nodes; a fresh list is created when omitted
    :param weight: counter contributing to the score in ``full_value``
    :param num_children: number supplied by the creator of the node
    :param original_value: original title; read from the root node by
        ``full_value``
    """

    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        self.value = value

        self.parent = parent

        # Avoid sharing one default list between instances
        if right is None:
            right = []
        self.right = right

        self.weight = weight

        self.original_value = original_value
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        """Attach `nodes` as children, merging any whose value already exists.

        A node whose value matches an existing child adds its weight to that
        child and has its own children joined recursively; any other node is
        re-parented to this node and appended.
        """
        for node in nodes:
            duplicate = first(lambda x: x.value == node.value, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        """Return ``(score, text, original_value)`` for the path from the root.

        `text` joins, root first, the values of all nodes on the path that
        are non-empty and not dead; `score` is the mean weight of those
        nodes, or 0.0 when there are none. `original_value` is taken from
        the root node of the path.
        """
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur
            cur = cur.parent

        # With no live words the mean is undefined; use 0.0 rather than
        # dividing by zero.
        score = float(total_score) / len(words) if words else 0.0

        return score, ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )
|
||||
280
libs/qcond/transformers/slice.py
Normal file
280
libs/qcond/transformers/slice.py
Normal file
@@ -0,0 +1,280 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from logr import Logr
|
||||
from qcond.helpers import create_matcher
|
||||
from qcond.transformers.base import Transformer
|
||||
|
||||
|
||||
class SliceTransformer(Transformer):
    """Remove titles that are near-duplicates or trailing extensions of others."""

    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        """Return the titles that survive both kill passes, most merges first."""
        separator = '---------------------------------------------------------------------'

        # Create a node for each title
        nodes = [SimNode(title) for title in titles]

        # Calculate similarities between nodes
        for node in nodes:
            others = [other for other in nodes if other != node]
            calculate_sim_links(node, others)

        kill_nodes_above(nodes, 0.90)

        Logr.debug(separator)

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug(separator)

        kill_trailing_nodes(nodes)

        Logr.debug(separator)

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug(separator)

        values = [n.value for n in nodes]
        Logr.debug('%s %s', len(nodes), values)

        return values
|
||||
|
||||
|
||||
class SimLink(object):
    """Plain record describing how one SimNode relates to another.

    Holds the similarity ratio, the matcher opcodes and the statistics
    derived from those opcodes.
    """

    def __init__(self, similarity, opcodes, stats):
        self.stats = stats
        self.opcodes = opcodes
        self.similarity = similarity
|
||||
|
||||
|
||||
class SimNode(object):
    """A title together with its similarity links and merge bookkeeping."""

    def __init__(self, value):
        # {<other SimNode>: <SimLink>}
        self.links = {}

        self.num_merges = 0
        self.dead = False

        self.value = value
|
||||
|
||||
|
||||
def kill_nodes(nodes, killed_nodes):
    """Remove `killed_nodes` from `nodes` and from the links of the survivors.

    Both `nodes` and the surviving nodes' ``links`` dicts are modified in
    place.
    """
    # Remove killed nodes from root list
    for victim in killed_nodes:
        if victim in nodes:
            nodes.remove(victim)

    # Remove killed nodes from links
    for victim in killed_nodes:
        for survivor in nodes:
            survivor.links.pop(victim, None)
|
||||
|
||||
|
||||
def kill_nodes_above(nodes, above_sim):
    """Kill one node of every live pair whose similarity is >= `above_sim`.

    Of such a pair, the node with the longer value is killed; on equal
    length the node owning the link is killed. The other node's
    ``num_merges`` is incremented. Killed nodes are removed from `nodes`
    once all pairs were inspected.
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            if link.similarity < above_sim:
                continue

            if len(link_node.value) > len(node.value):
                Logr.debug('\t\tvery similar, killed this node')
                victim, keeper = link_node, node
            else:
                Logr.debug('\t\tvery similar, killed owner')
                victim, keeper = node, link_node

            victim.dead = True
            keeper.num_merges += 1
            killed_nodes.append(victim)

    kill_nodes(nodes, killed_nodes)
|
||||
|
||||
|
||||
def print_link_tree(nodes):
    """Debug-log every node's value, merge count and similarity links."""
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        # Nodes without links get no LINKS section
        if not len(node.links):
            continue

        Logr.debug('\t========== LINKS ==========')

        for link_node, link in node.links.items():
            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

        Logr.debug('\t---------------------------')
|
||||
|
||||
|
||||
def kill_trailing_nodes(nodes):
    """Kill linked nodes that only add text to a live node's value.

    A linked node is killed when its link is marked valid, consists of
    insertions only (no deletions or replacements), at least half of the
    owning node's length is 'equal' and the inserted amount is below twice
    that length. The owning node's ``num_merges`` is incremented for every
    kill. Killed nodes are removed from `nodes` after all links were checked.

    :param nodes: list of SimNode objects; modified in place
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        # An empty value would make the percentages divide by zero; with a
        # divisor of 1 they simply stay at the raw (zero 'equal') counts.
        value_length = float(len(node.value)) or 1.0

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            # Opcode tags present on this link
            tags = set(opcode[0] for opcode in link.opcodes)

            has_deletions = 'delete' in tags
            has_insertions = 'insert' in tags
            has_replacements = 'replace' in tags

            equal_perc = link.stats.get('equal', 0) / value_length
            insert_perc = link.stats.get('insert', 0) / value_length

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)
|
||||
|
||||
# Debug-log layout for one opcode: the tag followed by two index pairs.
stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"
|
||||
|
||||
|
||||
def get_index_values(iterable, a, b):
    """Return the items of `iterable` at positions `a` and `b`.

    :param iterable: any indexable sequence
    :param a: index, or None for "no position"
    :param b: index, or None for "no position"
    :return: tuple ``(item_a, item_b)`` with None in place of a None index
    """
    # Compare against None explicitly: index 0 is a valid position and must
    # not be treated as missing.
    return (
        iterable[a] if a is not None else None,
        iterable[b] if b is not None else None
    )
|
||||
|
||||
|
||||
def get_indices(iterable, a, b):
    """Return `a` and `b`, each replaced by None unless 0 < index < len(iterable)."""
    size = len(iterable)

    # NOTE(review): index 0 is rejected by the strict lower bound; confirm
    # whether that is intended before relaxing it.
    checked_a = a if 0 < a < size else None
    checked_b = b if 0 < b < size else None

    return checked_a, checked_b
|
||||
|
||||
|
||||
def get_opcode_stats(for_node, node, opcodes):
    """Summarise matcher `opcodes` into a stats dict.

    The result maps every opcode tag to the summed length of its ranges.
    When insert/delete opcodes are present it also holds 'valid', which is
    True only if every such opcode was judged valid: at both its head and
    its tail, one of the two inspected characters is a space or missing.
    """
    boundary = (None, ' ')
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ('insert', 'delete'):
            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)
            else:
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            head_valid = av[0] in boundary or bv[0] in boundary
            tail_valid = av[1] in boundary or bv[1] in boundary
            valid = head_valid and tail_valid

            # Once any opcode is invalid, the whole link stays invalid
            stats['valid'] = stats.get('valid', True) and valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        # One of the two ranges is empty for insert/delete opcodes, so fall
        # back to the second range when the first has zero length.
        stats[tag] = stats.get(tag, 0) + ((i2 - i1) or (j2 - j1))

    return stats
|
||||
|
||||
|
||||
def calculate_sim_links(for_node, other_nodes):
    """Create SimLinks in both directions between `for_node` and each other node.

    Nodes already present in ``for_node.links`` are skipped. Both links of a
    pair share one similarity ratio but carry the opcodes and stats of their
    own direction.
    """
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity = create_matcher(for_node.value, node.value).quick_ratio()

        # Get for_node -> node opcodes
        forward_matcher = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = forward_matcher.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        backward_matcher = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = backward_matcher.get_opcodes()
        # NOTE(review): the nodes are passed in the same order as for the
        # forward direction although the opcodes are reversed - confirm intended.
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)
|
||||
26
libs/qcond/transformers/strip_common.py
Normal file
26
libs/qcond/transformers/strip_common.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from qcond.transformers.base import Transformer
|
||||
|
||||
|
||||
# Lower-case titles that are too generic to keep on their own.
COMMON_WORDS = ['the']
|
||||
|
||||
|
||||
class StripCommonTransformer(Transformer):
    """Drop titles that consist solely of a common word."""

    def run(self, titles):
        """Return `titles` without those whose lower-cased form is in COMMON_WORDS."""
        kept = []

        for title in titles:
            if title.lower() in COMMON_WORDS:
                continue

            kept.append(title)

        return kept
|
||||
Reference in New Issue
Block a user