atrsimilar.review.inc

<?php
// $Id: atrsimilar.review.inc,v 1.1.2.4 2009/10/07 00:35:06 xano Exp $

/**
 * @file
 *   Review strings.
 */

/**
 * Find (almost) duplicate strings.
 *
 * @param $profile
 *   The settings profile used for this review.
 * @param $context
 *   A batch operation context.
 */
function atrsimilar_review($review, $profile, &$context) {
  // Get and set values necessary for this operation.
  if (!isset($context['sandbox']['from'])) {
    $context['sandbox']['from'] = 0;
    $context['sandbox']['count'] = db_result(db_query("SELECT COUNT(*) FROM {atr_string} WHERE rid = %d", $review->rid));
  }
  $from = $context['sandbox']['from'];
  $count = $context['sandbox']['count'];

  $context['message'] = t('Checked @done of @total strings for similarity.', array('@done' => $from, '@total' => $count));

  $strings = $sids = array();
  $result = db_query_range("SELECT sid, string FROM {atr_string} WHERE rid = %d", $review->rid, $from, $count);
  while ($string_data = db_fetch_object($result)) {
    $strings[] = $string_data->string;
    $sids[] = $string_data->sid;
  }
  // The string count of this run only.
  $count_run = count($strings);

  // Calculate how many strings we're going to compare during this run.
  // Try the first string at least.
  $i_max = 0;
  // The amount of comparisons we're going to do during this run.
  $comparisons = $count_run - 1;
  // Do a maximum of 10,000 comparisons per run.
  $comparisons_per_run = 10000;
  // The last string has been compared to all previous strings already.
  while ($i_max < $count_run - 2) {
    // The amount of comparisons required for another string during this run.
    $comparisons_next_string = $count_run - $i_max - 2;
    if ($comparisons + $comparisons_next_string < $comparisons_per_run) {
      $comparisons += $comparisons_next_string;
      $i_max++;
    }
    else {
      break;
    }
  }

  $values = array();
  $threshold = (int) variable_get('atrsimilar_profile_' . $profile->pid . '_threshold', 90);
  for ($i = 0; $i <= $i_max; $i++) {
    for ($j = $i + 1; $j < $count_run; $j++) {
      $similarity = atrsimilar_similarity($strings[$i], $strings[$j]);
      if ($similarity > $threshold) {
        $values[] = $sids[$i];
        $values[] = $sids[$j];
        $values[] = $similarity;
      }
    }
  }
  // Save matches to the database.
  if ($matches = count($values) / 3) {
    $placeholders = implode(',', array_fill(0, $matches, '(%d, %d, %d)'));
    db_query("INSERT INTO {atrsimilar_string} VALUES " . $placeholders, $values);
  }

  // Inform Batch API we are not yet finished and provide an estimation of the
  // completion level we reached.
  if ($i_max < $count_run - 2) {
    $context['sandbox']['from'] = $from + $i_max + 1;
    $context['finished'] = $from / $count;
  }
}

/**
 * Find two strings their similarity.
 *
 * @param $string_a
 *   The first string.
 * @param $string_b
 *   The second string.
 *
 * @return
 *   The strings' similarity in percents.
 */
function atrsimilar_similarity($string_a, $string_b) {
  $diff = atrsimilar_diff($string_a, $string_b);
  $length = array(drupal_strlen($string_a), drupal_strlen($string_b));
  $length_common = 0;
  foreach ($diff[0] as $token) {
    if ($token['common']) {
      $length_common += drupal_strlen($token['string']);
    }
  }

  // We define the total amount of characters as the total length of the common
  // subsequences and all characters that are different.
  $total = $length[0] + $length[1] - $length_common;

  return $similarity = $length_common / $total * 100;
}

/**
 * Compute the differences between two strings.
 */
function atrsimilar_diff($old, $new) {
  $tokens = atrsimilar_tokenize($old, $new);
  $prefix = atrsimilar_prefix($tokens);
  $suffix = atrsimilar_suffix($tokens);
  $length[0] = count($tokens[0]);
  $length[1] = count($tokens[1]);
  $matrix = atrsimilar_lcs_matrix($tokens, $length);
  atrsimilar_diff_build($tokens, $matrix, $length);

  // Add the previously trimmed prefix and suffix.
  foreach (array(0, 1) as $i) {
    if ($prefix) {
      array_unshift($tokens[$i], atrsimilar_expand_token($prefix, TRUE));
    }
    if ($suffix) {
      array_push($tokens[$i], atrsimilar_expand_token($suffix, TRUE));
    }
  }

  return array(atrsimilar_diff_merge($tokens[0]), atrsimilar_diff_merge($tokens[1]));
}

/**
 * Tokenize the old and new strings.
 */
function atrsimilar_tokenize($old, $new) {
  $tokens = array();
  foreach(array($old, $new) as $i => $string) {
    $tokens[$i] = preg_split('/(?<=\W)|(?=\W)/u', $string);
  }

  return $tokens;
}

/**
 * Remove identical prefix for performance improvements.
 */
function atrsimilar_prefix(&$tokens) {
  $prefix = '';
  while ($tokens[0] && $tokens[1] && reset($tokens[0]) === reset($tokens[1])) {
    $prefix .= array_shift($tokens[0]);
    array_shift($tokens[1]);
  }

  return $prefix;
}

/**
 * Remove identical suffix for performance improvements.
 */
function atrsimilar_suffix(&$tokens) {
  $suffix = '';
  while ($tokens[0] && $tokens[1] && end($tokens[0]) === end($tokens[1])) {
    $suffix = array_pop($tokens[0]) . $suffix;
    array_pop($tokens[1]);
  }

  return $suffix;
}

/**
 * Compute an LCS matrix.
 */
function atrsimilar_lcs_matrix($tokens, $length) {
  $matrix = array();

  for ($i = 0; $i < $length[0]; $i++) {
    for ($j = 0; $j < $length[1]; $j++) {
      if ($tokens[0][$i] === $tokens[1][$j]) {
        $matrix[$i][$j] = (isset($matrix[$i - 1][$j - 1]) ? $matrix[$i - 1][$j - 1] : 0) + 1;
      }
      else {
        $matrix[$i][$j] = max(
          isset($matrix[$i][$j - 1]) ? $matrix[$i][$j - 1] : 0,
          isset($matrix[$i - 1][$j]) ? $matrix[$i - 1][$j] : 0
        );
      }
    }
  }

  return $matrix;
}

function atrsimilar_expand_token($string, $common = FALSE) {
  return array(
    'string' => $string,
    'common' => $common,
  );
}

function atrsimilar_diff_build(&$tokens, $matrix, $length) {
  $tokens[0] = array_map('atrsimilar_expand_token', $tokens[0]);
  $tokens[1] = array_map('atrsimilar_expand_token', $tokens[1]);

  // We traverse the matrix starting with the cell in the bottom right corner
  // with coordinates ($length[0] -1, $length[1] -1). After every loop the
  // 'pointer' moves one position up and to the left, which is the desired
  // behaviour if the two tokens for a particular set of coordinates match. If
  // they don't, we modify the next set of coordinates by changing $i and $j.
  for ($i = $length[0] - 1, $j = $length[1] - 1; $i >= 0 && $j >= 0; $i--, $j--) {
    // Check if the current token has changed or not.
    if ($tokens[0][$i]['string'] !== $tokens[1][$j]['string']) {
      $tokens[0][$i]['common'] = $tokens[1][$j]['common'] = FALSE;
      if (($i > 0 && $j > 0 && ($matrix[$i][$j - 1] < $matrix[$i - 1][$j]))) {
        $j++;
      }
      else {
        $i++;
      }
    }
    else {
      // These tokens are common, so mark them.
      $tokens[0][$i]['common'] = $tokens[1][$j]['common'] = TRUE;

    }
  }
}

function atrsimilar_diff_merge($tokens) {
  $diff = array();
  $i = -1;
  foreach ($tokens as $token) {
    if (isset($diff[$i]) && $diff[$i]['common'] == $token['common']) {
      $diff[$i]['string'] .= $token['string'];
    }
    else {
      $i++;
      $diff[$i] = $token;
    }
  }

  return $diff;
}