Added qcond (Query Condenser) v0.1.0 library - https://github.com/fuzeman/QueryCondenser

This commit is contained in:
Dean Gardiner
2013-10-15 16:16:26 +13:00
parent 07abf7c83d
commit f23412ea7e
8 changed files with 714 additions and 0 deletions

42
libs/qcond/__init__.py Normal file
View File

@@ -0,0 +1,42 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from qcond.transformers.merge import MergeTransformer
from qcond.transformers.slice import SliceTransformer
from qcond.transformers.strip_common import StripCommonTransformer
# Version components, kept as strings so they can be joined directly.
__version_info__ = ('0', '1', '0')
# Branch label appended to the version string; empty means no suffix.
__version_branch__ = 'master'

# Dotted version number, followed by "-<branch>" when a branch label is set.
__version__ = '.'.join(__version_info__)
if __version_branch__:
    __version__ = __version__ + '-' + __version_branch__
class QueryCondenser(object):
    """Run a list of titles through a fixed chain of transformers."""

    def __init__(self):
        # Order matters: every transformer receives the previous one's output.
        self.transformers = [
            transformer_class()
            for transformer_class in (MergeTransformer, SliceTransformer, StripCommonTransformer)
        ]

    def distinct(self, titles):
        """Feed `titles` through each transformer in turn.

        :param titles: list of title strings
        :return: whatever the last transformer's `run` returns (the input
                 itself when there are no transformers)
        """
        condensed = titles

        for stage in self.transformers:
            condensed = stage.run(condensed)

        return condensed

23
libs/qcond/compat.py Normal file
View File

@@ -0,0 +1,23 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
# True when running on a Python 3 interpreter.
PY3 = sys.version_info[0] == 3

# Make `xrange` importable from this module on both major versions: on
# Python 3 it is bound to `range`, otherwise the existing `xrange` is re-exported.
xrange = range if PY3 else xrange

84
libs/qcond/helpers.py Normal file
View File

@@ -0,0 +1,84 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from difflib import SequenceMatcher
import re
import sys
from logr import Logr
from qcond.compat import xrange
# True when the running interpreter is Python 3.
PY3 = sys.version_info[0] == 3
def simplify(s):
    """Normalise a title for comparison.

    Lower-cases `s` and removes every apostrophe that sits directly between
    two word characters (e.g. "don't" -> "dont"). Apostrophes at the edge of a
    word are left untouched.

    :param s: title string
    :return: the normalised string
    """
    s = s.lower()

    # Lookarounds (rather than capturing the neighbours) leave the surrounding
    # characters unconsumed, so back-to-back apostrophes such as "rock'n'roll"
    # are all removed instead of only every other one.
    return re.sub(r"(?<=\w)'(?=\w)", "", s)
def strip(s):
    """Return `s` without its leading and trailing non-word characters.

    :param s: string to trim
    :return: the middle group of the match; `s` unchanged when the pattern
             does not match
    """
    # group 1 = leading non-word run, group 2 = kept text, group 3 = trailing run
    edge_pattern = re.compile(r"^(\W*)(.*?)(\W*)$")

    return edge_pattern.sub(r"\2", s)
def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    """Build a `SequenceMatcher` for two strings.

    :param a: first string
    :param b: second string
    :param swap_longest: when true, the longer string is always used as the
                         matcher's first sequence
    :param case_sensitive: when false, both strings are upper-cased before
                           being compared
    :return: `SequenceMatcher` over the (possibly swapped / upper-cased) strings
    """
    # Ensure longest string is the first sequence
    if swap_longest and len(a) < len(b):
        a, b = b, a

    if case_sensitive:
        return SequenceMatcher(None, a, b)

    return SequenceMatcher(None, a.upper(), b.upper())
def first(function_or_none, sequence):
    """Return the first item of `sequence` accepted by `function_or_none`.

    Follows the conventions of the built-in `filter`: when `function_or_none`
    is None the items themselves are tested for truthiness.

    :param function_or_none: predicate taking one item, or None
    :param sequence: iterable to search
    :return: the first accepted item, or None when nothing is accepted
    """
    predicate = bool if function_or_none is None else function_or_none

    # A generator stops at the first hit on every Python version, so no
    # interpreter-specific branch (or full filtered list) is needed.
    return next((item for item in sequence if predicate(item)), None)
def sorted_append(sequence, item, func):
    """Insert `item` into `sequence` in place, ahead of the first match.

    `func` is called with each existing element in order; `item` is inserted
    directly before the first element for which it returns a true value. When
    no element matches (including an empty sequence) `item` goes at the end.

    :param sequence: list to modify
    :param item: value to insert
    :param func: predicate taking an existing element
    :return: None
    """
    for index, existing in enumerate(sequence):
        if func(existing):
            sequence.insert(index, item)
            return

    # Nothing matched (or the sequence was empty)
    sequence.append(item)
def itemsMatch(L1, L2):
    """Tell whether two sequences hold the same items, ignoring order.

    :param L1: first sequence
    :param L2: second sequence
    :return: True when both have the same length and equal sorted contents
    """
    # Different sizes can never match; skip the sorting
    if len(L1) != len(L2):
        return False

    return sorted(L1) == sorted(L2)
def distinct(sequence):
    """Return the items of `sequence` with duplicates dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items are compared with `in` on a list, so they do not need to
    be hashable.

    :param sequence: iterable of items
    :return: new list of unique items
    """
    unique = []

    for candidate in sequence:
        if candidate in unique:
            continue

        unique.append(candidate)

    return unique

View File

View File

@@ -0,0 +1,21 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Transformer(object):
    """Base class for the title transformers; subclasses implement `run`."""

    def __init__(self):
        """Nothing to initialise in the base class."""

    def run(self, titles):
        """Transform `titles`; must be overridden.

        :param titles: list of title strings
        :raises NotImplementedError: always, in the base class
        """
        raise NotImplementedError()

View File

@@ -0,0 +1,238 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from operator import itemgetter
from logr import Logr
from qcond.helpers import simplify, strip, first, sorted_append, distinct
from qcond.transformers.base import Transformer
from qcond.compat import xrange
class MergeTransformer(Transformer):
    """Condense titles by building a word tree and merging sibling branches.

    Every title is split into words and stored as a chain of `DNode` objects,
    so titles that start with the same words share nodes. Sibling branches are
    then merged and each title is rebuilt from the nodes that survive.
    """

    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        """Return the merged titles, highest score first.

        :param titles: iterable of title strings
        :return: list of rebuilt title strings, sorted in descending order of
                 (accumulated score, original title)
        """
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            if value in scores:
                # Several tails rebuilt to the same text: pool their scores
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        """Build the word tree for `titles`.

        :param titles: iterable of title strings (split on single spaces)
        :return: tuple `(root, tails)` where `root` is the list of first-word
                 nodes and `tails` holds the last-word node of every title
        """
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx, raw_word in enumerate(words):
                word = strip(raw_word)
                # Number of words from this one to the end of the title
                remaining = len(words) - wx

                if cur is None:
                    # First word: reuse or create a root node
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=remaining, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, remaining)
                        cur = DNode(word, parent, num_children=remaining)
                        # Keep children ordered by descending num_children
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        """Merge the children of every root node in place.

        :param root: list of root nodes as returned by `parse`
        :return: the same `root` list
        """
        for node in root:
            Logr.debug(node)
            node.right = self._merge(node.right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        """Collect `value` and everything below it, depth first.

        :param value: a node or a list of nodes
        :return: flat list of the node(s) followed by their descendants
        """
        if not isinstance(value, list):
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)
            nodes.extend(self.get_nodes_right(node.right))

        return nodes

    def destroy_nodes_right(self, value):
        """Clear the value of, and mark dead, `value` and all its descendants.

        :param value: a node or a list of nodes
        """
        for node in self.get_nodes_right(value):
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        """Fold sibling `nodes` into the first one, then recurse into children.

        :param nodes: list of sibling nodes
        :param depth: recursion depth, only used to indent the debug output
        :return: list of the siblings that are not marked dead
        """
        Logr.debug(str('\t' * depth) + str(nodes))

        # A node without children (e.g. a single-word title) has nothing to
        # merge; without this guard `nodes[0]` raises IndexError.
        if not nodes:
            return nodes

        top = nodes[0]

        # Merge every extra sibling into top
        for extra in nodes[1:]:
            top.value = None
            top.weight += extra.weight
            self.destroy_nodes_right(top.right)

            if len(extra.right):
                top.join_right(extra.right)

                Logr.debug("= %s joined %s", extra, top)

            extra.dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes
def print_tree(node, depth = 0):
    """Write `node` and its subtree to the debug log, one tab per level.

    Nodes without children additionally log the text rebuilt by `full_value()`.

    :param node: node to print
    :param depth: indentation level of `node`
    """
    indent = str('\t' * depth)
    Logr.debug(indent + str(node))

    if not len(node.right):
        Logr.debug(node.full_value()[1])
        return

    for child in node.right:
        print_tree(child, depth + 1)
def find_node(node_list, value):
    """Return the first node in `node_list` whose `value` equals `value`.

    :param node_list: iterable of nodes
    :param value: value to look for
    :return: the matching node, or None when there is none
    """
    matches = (candidate for candidate in node_list if candidate.value == value)

    return next(matches, None)
class DNode(object):
    """One word in the tree of titles; children hang off `right`."""

    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        """
        :param value: the word held by this node
        :param parent: node of the preceding word, or None for a first word
        :param right: list of child nodes; a fresh list is used when omitted
        :param weight: starting weight of the node
        :param num_children: stored as given; also used to order siblings
        :param original_value: the full title this node was created from
        """
        self.value = value

        self.parent = parent

        # A new list per instance, never a shared default
        if right is None:
            right = []
        self.right = right

        self.weight = weight

        self.original_value = original_value
        self.num_children = num_children

        # Dead nodes are left out of `full_value()`
        self.dead = False

    def join_right(self, nodes):
        """Attach `nodes` as children, folding in duplicates by value.

        A node whose value already exists among the children adds its weight
        to the existing child and has its own children joined recursively;
        any other node is re-parented to this node and appended.

        :param nodes: iterable of nodes to attach
        """
        for node in nodes:
            duplicate = first(lambda x: x.value == node.value, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        """Rebuild the text ending at this node by walking up to the root.

        Only nodes that have a value and are not dead contribute a word and
        their weight.

        :return: tuple `(score, text, original_value)` where `score` is the
                 summed weight divided by the number of words (0.0 when there
                 are no words), `text` is the words joined by spaces and
                 `original_value` comes from the root node
        """
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur

            cur = cur.parent

        # No contributing words (e.g. an empty title) would otherwise divide by zero
        score = float(total_score) / len(words) if words else 0.0

        return score, ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )

View File

@@ -0,0 +1,280 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logr import Logr
from qcond.helpers import create_matcher
from qcond.transformers.base import Transformer
class SliceTransformer(Transformer):
    """Drop titles that are near-duplicates or extensions of other titles."""

    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        """Return the surviving titles, most-merged first.

        :param titles: iterable of title strings
        :return: list of title strings, sorted by descending `num_merges`
        """
        separator = '---------------------------------------------------------------------'

        # Create a node for each title
        nodes = [SimNode(title) for title in titles]

        # Calculate similarities between nodes
        for current in nodes:
            others = [candidate for candidate in nodes if candidate != current]
            calculate_sim_links(current, others)

        kill_nodes_above(nodes, 0.90)

        Logr.debug(separator)

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug(separator)

        kill_trailing_nodes(nodes)

        Logr.debug(separator)

        # Sort remaining nodes by 'num_merges'
        nodes.sort(key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug(separator)

        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        return [n.value for n in nodes]
class SimLink(object):
    """Similarity data stored on one node about one other node."""

    def __init__(self, similarity, opcodes, stats):
        """
        :param similarity: similarity score of the two node values
        :param opcodes: opcode list describing the differences
        :param stats: statistics dict derived from `opcodes`
        """
        self.similarity, self.opcodes, self.stats = similarity, opcodes, stats
class SimNode(object):
    """A title plus its similarity links to other titles."""

    def __init__(self, value):
        """
        :param value: the title string this node stands for
        """
        self.value = value

        # Becomes True once the node has been eliminated
        self.dead = False
        # Counts how many other nodes were eliminated in favour of this one
        self.num_merges = 0

        # {<other SimNode>: <SimLink>}
        self.links = dict()
def kill_nodes(nodes, killed_nodes):
    """Remove `killed_nodes` from `nodes` and from the survivors' links.

    `nodes` is modified in place.

    :param nodes: list of nodes
    :param killed_nodes: nodes to remove (may contain repeats)
    """
    # Remove killed nodes from root list
    for victim in killed_nodes:
        try:
            nodes.remove(victim)
        except ValueError:
            # Not in the list (any more), e.g. listed twice
            pass

    # Remove killed nodes from links
    for victim in killed_nodes:
        for survivor in nodes:
            survivor.links.pop(victim, None)
def kill_nodes_above(nodes, above_sim):
    """Eliminate one node of every linked pair at or above `above_sim`.

    Of the two nodes, the one with the longer value is killed; when the linked
    node is not longer, the owning node is killed instead. The other node of
    the pair gets its `num_merges` incremented. `nodes` is modified in place.

    :param nodes: list of nodes with populated `links`
    :param above_sim: similarity threshold (inclusive)
    """
    casualties = []

    for owner in nodes:
        if owner.dead:
            continue

        Logr.debug(owner.value)

        for other, link in owner.links.items():
            if other.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, other.value)

            if not link.similarity >= above_sim:
                continue

            if len(other.value) > len(owner.value):
                Logr.debug('\t\tvery similar, killed this node')
                loser, keeper = other, owner
            else:
                Logr.debug('\t\tvery similar, killed owner')
                loser, keeper = owner, other

            loser.dead = True
            keeper.num_merges += 1
            casualties.append(loser)

    kill_nodes(nodes, casualties)
def print_link_tree(nodes):
    """Write every node, its merge count and its links to the debug log.

    :param nodes: iterable of nodes
    """
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        # Nodes without links get no link section at all
        if not len(node.links):
            continue

        Logr.debug('\t========== LINKS ==========')

        for link_node, link in node.links.items():
            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

        Logr.debug('\t---------------------------')
def kill_trailing_nodes(nodes):
    """Eliminate linked nodes that differ from a node only by insertions.

    A linked node is killed when the link's stats are flagged valid, the
    'equal' count is at least half the node's length, the 'insert' count is
    below twice the node's length, and the opcodes contain insertions but no
    deletions or replacements. The node that stays gets its `num_merges`
    incremented. `nodes` is modified in place.

    :param nodes: list of nodes with populated `links`
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            # Which kinds of edit occur anywhere in this link
            seen_tags = set(opcode[0] for opcode in link.opcodes)
            has_deletions = 'delete' in seen_tags
            has_insertions = 'insert' in seen_tags
            has_replacements = 'replace' in seen_tags

            # Both ratios are relative to the length of the owning node's value
            own_length = float(len(node.value))
            equal_perc = link.stats.get('equal', 0) / own_length
            insert_perc = link.stats.get('insert', 0) / own_length

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            insert_only = has_insertions and not (has_deletions or has_replacements)

            if not (is_valid and equal_perc >= 0.5 and insert_perc < 2 and insert_only):
                continue

            Logr.debug('\t\tkilled this node')

            link_node.dead = True
            node.num_merges += 1
            killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)
# Debug-log layout for one opcode: the tag followed by its (i1:i2) and (j1:j2) ranges.
stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"
def get_index_values(iterable, a, b):
    """Look up two optional indices in `iterable`.

    :param iterable: indexable object (e.g. a string)
    :param a: first index, or None
    :param b: second index, or None
    :return: tuple `(iterable[a], iterable[b])` with None in place of any
             element whose index is None
    """
    # Compare against None explicitly: 0 is a valid index, not a missing one
    return (
        iterable[a] if a is not None else None,
        iterable[b] if b is not None else None
    )
def get_indices(iterable, a, b):
    """Keep only the indices that fall strictly between 0 and `len(iterable)`.

    :param iterable: object with a length
    :param a: first candidate index
    :param b: second candidate index
    :return: tuple `(a, b)` with None in place of any rejected index
    """
    # NOTE(review): the lower bound is exclusive, so index 0 is rejected as
    # well - confirm that this is intended.
    def _inside(index):
        if 0 < index < len(iterable):
            return index

        return None

    return _inside(a), _inside(b)
def get_opcode_stats(for_node, node, opcodes):
    # Summarise a list of (tag, i1, i2, j1, j2) opcodes into a dict.
    #
    # The result maps every tag that occurs to an accumulated length, and -
    # when at least one 'insert' or 'delete' opcode is present - holds a
    # 'valid' flag that is True only if every such opcode was judged valid.
    #
    # for_node / node: objects whose `value` is indexed with the opcode offsets
    # opcodes:         iterable of 5-tuples (tag, i1, i2, j1, j2)
    # returns:         the stats dict
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            # Index pairs into for_node.value (ax) and node.value (bx);
            # None marks a position rejected by get_indices
            ax = None, None
            bx = None, None

            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                # Same lookups with the i/j offsets swapped
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            # Characters found at those positions (None where the index is None)
            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            # Each side of the edit counts as valid when either looked-up
            # character is a space or missing
            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            # Set on first sight; afterwards it can only go from True to False
            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        if tag not in stats:
            stats[tag] = 0

        # Length of the i-range, or of the j-range when the i-range is empty
        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats
def calculate_sim_links(for_node, other_nodes):
    """Create the similarity links between `for_node` and each other node.

    A link is stored on both nodes of a pair, so pairs that already have a
    link on `for_node` are skipped.

    :param for_node: node to link from
    :param other_nodes: iterable of nodes to link to
    """
    for other in other_nodes:
        if other in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, other.value)

        # Get similarity
        similarity = create_matcher(for_node.value, other.value).quick_ratio()

        # Get for_node -> node opcodes
        forward_opcodes = create_matcher(for_node.value, other.value, swap_longest = False).get_opcodes()
        forward_stats = get_opcode_stats(for_node, other, forward_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        # NOTE(review): these opcodes are computed with the operands reversed,
        # yet the stats call keeps the (for_node, node) order - confirm intended.
        backward_opcodes = create_matcher(other.value, for_node.value, swap_longest = False).get_opcodes()
        backward_stats = get_opcode_stats(for_node, other, backward_opcodes)

        for_node.links[other] = SimLink(similarity, forward_opcodes, forward_stats)
        other.links[for_node] = SimLink(similarity, backward_opcodes, backward_stats)

View File

@@ -0,0 +1,26 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from qcond.transformers.base import Transformer
# Lower-case words that are dropped when a title consists of nothing else.
COMMON_WORDS = ['the']
class StripCommonTransformer(Transformer):
    """Drop titles that are nothing but a common word."""

    def run(self, titles):
        """Return the titles whose lower-cased form is not in `COMMON_WORDS`.

        :param titles: iterable of title strings
        :return: list of the remaining titles, in their original order
        """
        kept = []

        for title in titles:
            if title.lower() in COMMON_WORDS:
                continue

            kept.append(title)

        return kept