__version_info__ = ('0', '1', '0')
__version_branch__ = 'master'

# "<major>.<minor>.<patch>", followed by "-<branch>" when a branch is set.
__version__ = "%s%s" % (
    '.'.join(__version_info__),
    '-' + __version_branch__ if __version_branch__ else ''
)


class QueryCondenser(object):
    """Run a list of titles through a fixed pipeline of transformers."""

    def __init__(self):
        # Stages are instantiated, and later applied, in exactly this order.
        pipeline = (MergeTransformer, SliceTransformer, StripCommonTransformer)
        self.transformers = [stage() for stage in pipeline]

    def distinct(self, titles):
        """Return `titles` after every transformer has processed them.

        Each transformer's ``run`` receives the output of the previous one;
        the output of the last transformer is returned.
        """
        condensed = titles

        for stage in self.transformers:
            condensed = stage.run(condensed)

        return condensed
# True when running under Python 3.
PY3 = sys.version_info[0] == 3

# Python 3 has no ``xrange``; expose ``range`` under that name there and keep
# the builtin ``xrange`` on Python 2. The right-hand ``xrange`` is only
# evaluated on Python 2, so it never raises NameError on Python 3.
xrange = range if PY3 else xrange
PY3 = sys.version_info[0] == 3


def simplify(s):
    """Lower-case `s` and drop apostrophes that sit between two word characters."""
    return re.sub(r"(\w)'(\w)", r"\1\2", s.lower())


def strip(s):
    """Return `s` without its leading and trailing non-word characters."""
    return re.sub(r"^(\W*)(.*?)(\W*)$", r"\2", s)


def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    """Build a ``SequenceMatcher`` for the two strings.

    :param swap_longest: when true, the longer string is used as sequence ``a``
    :param case_sensitive: when false, both strings are upper-cased first
    """
    if swap_longest and len(a) < len(b):
        a, b = b, a

    if not case_sensitive:
        a, b = a.upper(), b.upper()

    return SequenceMatcher(None, a, b)


def first(function_or_none, sequence):
    """Return the first item of `sequence` accepted by ``filter``, or None.

    `function_or_none` has the same meaning as the first argument of the
    builtin ``filter`` (``None`` selects truthy items).
    """
    # iter() makes this work for both the Python 2 list result and the
    # Python 3 lazy filter object.
    return next(iter(filter(function_or_none, sequence)), None)


def sorted_append(sequence, item, func):
    """Insert `item` before the first element for which `func` is true.

    When no element satisfies `func` (including when `sequence` is empty)
    the item is appended. `sequence` is modified in place.
    """
    for position, existing in enumerate(sequence):
        if func(existing):
            sequence.insert(position, item)
            return

    sequence.append(item)


def itemsMatch(L1, L2):
    """Return True when both sequences hold the same items, ignoring order."""
    if len(L1) != len(L2):
        return False

    return sorted(L1) == sorted(L2)


def distinct(sequence):
    """Return the items of `sequence` without duplicates, keeping first-seen order."""
    unique = []

    for candidate in sequence:
        if candidate in unique:
            continue

        unique.append(candidate)

    return unique
class Transformer(object):
    """Base class for title transformers; subclasses implement ``run``."""

    def __init__(self):
        """Nothing to initialise; kept so subclasses can call it via ``super``."""

    def run(self, titles):
        """Transform `titles`; always raises NotImplementedError here."""
        raise NotImplementedError
class MergeTransformer(Transformer):
    """Condense titles by merging them into per-first-word trees.

    Every title is split on spaces and inserted into a tree keyed by its
    first word. Sibling branches are then merged, and each original title is
    read back from its tail node with the merged-away words left out.
    """

    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        """Return the condensed titles, highest score first.

        Titles are simplified and de-duplicated before parsing. Tails that
        condense to the same string have their scores summed. Titles that
        contain no usable word (empty or punctuation only) are dropped
        instead of raising.
        """
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            Logr.debug("%s %s %s", score, value, original_value)

            if not value:
                # No word survived for this title; an empty query is useless
                # to callers and to the later transformers.
                continue

            if value in scores:
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

        # Highest summed score first; ties are broken by the original title.
        sorted_results = sorted(
            results,
            key=lambda item: (scores[item[0]], item[1]),
            reverse=True
        )

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        """Build the word trees for `titles`.

        :return: ``(root, tails)`` where `root` is the list of first-word
                 nodes and `tails` holds the last node of each title, in
                 title order.
        """
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx, raw_word in enumerate(words):
                word = strip(raw_word)
                # Number of words from this one to the end of the title.
                remaining = len(words) - wx

                if cur is None:
                    # First word: reuse an existing root node or start a tree.
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=remaining, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, remaining)
                        cur = DNode(word, parent, num_children=remaining)
                        # Insert before the first sibling with fewer
                        # remaining words.
                        sorted_append(
                            parent.right, cur,
                            lambda a, new=cur: a.num_children < new.num_children
                        )
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        """Merge the children of every root node in place and return `root`."""
        for node in root:
            Logr.debug(node)
            node.right = self._merge(node.right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        """Return `value` (a node or list of nodes) and all descendants, depth first."""
        if not isinstance(value, list):
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)
            nodes.extend(self.get_nodes_right(node.right))

        return nodes

    def destroy_nodes_right(self, value):
        """Clear the value of, and mark dead, `value` and every descendant."""
        for node in self.get_nodes_right(value):
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        """Collapse sibling `nodes` into the first one and recurse.

        :return: the surviving (non-dead) nodes; an empty list when `nodes`
                 is empty, which is the case for single-word titles.
        """
        Logr.debug(str('\t' * depth) + str(nodes))

        if not nodes:
            # A node without children has nothing to merge.
            return []

        top = nodes[0]

        # Merge every extra sibling into top
        for extra in nodes[1:]:
            top.value = None
            top.weight += extra.weight
            self.destroy_nodes_right(top.right)

            if extra.right:
                top.join_right(extra.right)

                Logr.debug("= %s joined %s", extra, top)

            extra.dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if node.right:
                node.right = self._merge(node.right, depth + 1)

        return nodes


def print_tree(node, depth = 0):
    """Log `node` and its subtree; leaves also log their condensed value."""
    Logr.debug(str('\t' * depth) + str(node))

    if node.right:
        for child in node.right:
            print_tree(child, depth + 1)
    else:
        Logr.debug(node.full_value()[1])


def find_node(node_list, value):
    """Return the first node in `node_list` whose value equals `value`, else None."""
    for node in node_list:
        if node.value == value:
            return node

    return None


class DNode(object):
    """One word in a title tree."""

    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        # The word, or None once it has been merged away.
        self.value = value

        # Previous word's node; None for a root node.
        self.parent = parent

        # Nodes for the words that may follow this one.
        if right is None:
            right = []
        self.right = right

        self.weight = weight

        # Full title; only passed for root nodes by MergeTransformer.parse.
        self.original_value = original_value
        # Number of words from this one to the end of the title it came from.
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        """Adopt `nodes` as children, folding same-valued ones together."""
        for node in nodes:
            duplicate = first(lambda x, wanted=node.value: x.value == wanted, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        """Return ``(score, value, original_value)`` for the path to this node.

        `value` joins the words of every live, non-empty node from the root
        down to this node. `score` is the mean weight of those nodes, or 0.0
        when no word is left. `original_value` comes from the root node.
        """
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur
            cur = cur.parent

        original_value = root.original_value if root is not None else None

        if not words:
            # Avoid dividing by zero when every word on the path is empty,
            # cleared or dead.
            return 0.0, '', original_value

        return float(total_score) / len(words), ' '.join(words), original_value

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )
# --- libs/qcond/transformers/slice.py ---


class SliceTransformer(Transformer):
    """Drop titles that are near-duplicates or word-wise extensions of others."""

    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        """Return the surviving titles, ordered by `num_merges` (highest first)."""
        nodes = []

        # Create a node for each title
        for title in titles:
            nodes.append(SimNode(title))

        # Calculate similarities between nodes
        for node in nodes:
            calculate_sim_links(node, [n for n in nodes if n != node])

        # Collapse pairs whose similarity is at least 0.90
        kill_nodes_above(nodes, 0.90)

        Logr.debug('---------------------------------------------------------------------')

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug('---------------------------------------------------------------------')

        kill_trailing_nodes(nodes)

        Logr.debug('---------------------------------------------------------------------')

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug('---------------------------------------------------------------------')

        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        return [n.value for n in nodes]


class SimLink(object):
    """Comparison result stored on one node for one other node."""

    def __init__(self, similarity, opcodes, stats):
        # Value returned by SequenceMatcher.quick_ratio() for the pair
        self.similarity = similarity
        # SequenceMatcher.get_opcodes() from the owning node's value to the linked one
        self.opcodes = opcodes
        # Dict built by get_opcode_stats()
        self.stats = stats


class SimNode(object):
    """A title plus its links to the other titles."""

    def __init__(self, value):
        self.value = value

        self.dead = False
        # Incremented each time another node is killed in favour of this one
        self.num_merges = 0

        self.links = {}  # {<other SimNode>: <SimLink>}


def kill_nodes(nodes, killed_nodes):
    """Remove `killed_nodes` from `nodes` and from the remaining nodes' links (in place)."""
    # Remove killed nodes from root list
    for node in killed_nodes:
        if node in nodes:
            nodes.remove(node)

    # Remove killed nodes from links
    for killed_node in killed_nodes:
        for node in nodes:
            if killed_node in node.links:
                node.links.pop(killed_node)


def kill_nodes_above(nodes, above_sim):
    """Kill one node of every live pair whose similarity is >= `above_sim`.

    The node with the longer value is killed; on equal length the owner of
    the link (the node currently being iterated) is killed. The survivor's
    `num_merges` is incremented. `nodes` is modified in place.
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            if link.similarity >= above_sim:
                if len(link_node.value) > len(node.value):
                    Logr.debug('\t\tvery similar, killed this node')
                    link_node.dead = True
                    node.num_merges += 1
                    killed_nodes.append(link_node)
                else:
                    # NOTE(review): the loop keeps going over this node's
                    # links after it has been marked dead -- confirm intended.
                    Logr.debug('\t\tvery similar, killed owner')
                    node.dead = True
                    link_node.num_merges += 1
                    killed_nodes.append(node)

    kill_nodes(nodes, killed_nodes)


def print_link_tree(nodes):
    """Log every node with its merge count and link similarities."""
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if len(node.links):
            Logr.debug('\t========== LINKS ==========')
            for link_node, link in node.links.items():
                Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            Logr.debug('\t---------------------------')


def kill_trailing_nodes(nodes):
    """Kill linked nodes that only add to a shorter node's value.

    A linked node is killed when the opcodes from `node` to it contain
    insertions but no deletions or replacements, the link stats are flagged
    'valid', at least half of `node.value` is matched as 'equal', and the
    inserted length is less than twice `len(node.value)`. `nodes` is
    modified in place.
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            has_deletions = False
            has_insertions = False
            has_replacements = False

            for opcode in link.opcodes:
                if opcode[0] == 'delete':
                    has_deletions = True
                if opcode[0] == 'insert':
                    has_insertions = True
                if opcode[0] == 'replace':
                    has_replacements = True

            # Both ratios are relative to the length of the owning node's
            # value. NOTE(review): an empty node.value would divide by zero.
            equal_perc = link.stats.get('equal', 0) / float(len(node.value))
            insert_perc = link.stats.get('insert', 0) / float(len(node.value))

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)

# Debug layout for one opcode: tag, (i1:i2), (j1:j2)
stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"


def get_index_values(iterable, a, b):
    """Return ``(iterable[a], iterable[b])``, with None for any falsy index (None or 0)."""
    return (
        iterable[a] if a else None,
        iterable[b] if b else None
    )


def get_indices(iterable, a, b):
    """Return ``(a, b)``, replacing each index with None unless ``0 < index < len(iterable)``.

    NOTE(review): index 0 is rejected as well as out-of-range values --
    confirm that treating the first character like "no neighbour" is intended.
    """
    return (
        a if 0 < a < len(iterable) else None,
        b if 0 < b < len(iterable) else None
    )


def get_opcode_stats(for_node, node, opcodes):
    """Summarise `opcodes` into a dict.

    The dict maps each opcode tag to a summed length, and holds a 'valid'
    flag once an 'insert' or 'delete' has been seen. 'valid' is True only
    while every such opcode has None or a space on the sampled head
    position and on the sampled tail position in at least one of the two
    values.
    """
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            ax = None, None
            bx = None, None

            # NOTE(review): callers pass (for_node, node) in the same order
            # for both matcher directions; the two branches below swap which
            # index pair is applied to which value -- verify against
            # calculate_sim_links.
            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            # Set on first sight; afterwards it can only flip True -> False
            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        if tag not in stats:
            stats[tag] = 0

        # Span length in the first sequence, or in the second when that is 0
        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats


def calculate_sim_links(for_node, other_nodes):
    """Create links between `for_node` and each node in `other_nodes`.

    Nodes already present in ``for_node.links`` are skipped. Each new pair
    gets a SimLink on both nodes: they share the similarity value, but each
    carries the opcodes and stats for its own direction.
    """
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity_matcher = create_matcher(for_node.value, node.value)
        similarity = similarity_matcher.quick_ratio()

        # Get for_node -> node opcodes
        a_opcodes_matcher = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = a_opcodes_matcher.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        b_opcodes_matcher = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = b_opcodes_matcher.get_opcodes()
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)


# --- libs/qcond/transformers/strip_common.py ---

# Titles equal to one of these words (compared lower-cased) are discarded
COMMON_WORDS = [
    'the'
]


class StripCommonTransformer(Transformer):
    """Remove titles that consist solely of a common word."""

    def run(self, titles):
        """Return `titles` without entries whose lower-cased form is in COMMON_WORDS."""
        return [title for title in titles if title.lower() not in COMMON_WORDS]