atrsimilar_review

Versions
6.1
atrsimilar_review($review, $profile, &$context)

Find (almost) duplicate strings.

Parameters

$review The review whose strings are checked; its rid selects the rows read from {atr_string}.

$profile The settings profile used for this review.

$context A batch operation context.

Code

atr/modules/atrsimilar/includes/atrsimilar.review.inc, line 16

<?php
/**
 * Batch operation: find (almost) duplicate strings within a review.
 *
 * Each run loads every string of the review from the current offset onwards
 * and compares the leading strings against all strings that follow them,
 * stopping once roughly 10,000 comparisons have been scheduled. Pairs whose
 * similarity exceeds the profile threshold are written to
 * {atrsimilar_string}. The offset of the next string to process is kept in
 * $context['sandbox']['from'] so that the Batch API can call this function
 * repeatedly until all pairs have been compared.
 *
 * @param $review
 *   The review object; its rid property selects the rows of {atr_string}.
 * @param $profile
 *   The settings profile used for this review; its pid property selects the
 *   'atrsimilar_profile_<pid>_threshold' variable (default 90).
 * @param $context
 *   The batch operation context. Reads and writes $context['sandbox'], sets
 *   $context['message'], and sets $context['finished'] to a fraction below 1
 *   while work remains.
 */
function atrsimilar_review($review, $profile, &$context) {
  // First call of this operation: start at offset 0 and remember how many
  // strings the review contains.
  if (!isset($context['sandbox']['from'])) {
    $context['sandbox']['from'] = 0;
    $context['sandbox']['count'] = db_result(db_query("SELECT COUNT(*) FROM {atr_string} WHERE rid = %d", $review->rid));
  }
  $from = $context['sandbox']['from'];
  $count = $context['sandbox']['count'];

  $context['message'] = t('Checked @done of @total strings for similarity.', array('@done' => $from, '@total' => $count));

  // Load all strings from the current offset onwards. The explicit ORDER BY
  // is required because $from is a row offset that is reused across runs:
  // without a stable order the database is free to return rows differently
  // each time, which could skip strings or compare some of them twice.
  // NOTE(review): assumes sid is unique per string - confirm against schema.
  $strings = $sids = array();
  $result = db_query_range("SELECT sid, string FROM {atr_string} WHERE rid = %d ORDER BY sid", $review->rid, $from, $count);
  while ($string_data = db_fetch_object($result)) {
    $strings[] = $string_data->string;
    $sids[] = $string_data->sid;
  }
  // The string count of this run only.
  $count_run = count($strings);

  // Calculate how many strings we're going to compare during this run.
  // Try the first string at least, even if it alone exceeds the budget.
  $i_max = 0;
  // The amount of comparisons we're going to do during this run; the first
  // string is compared to every other loaded string.
  $comparisons = $count_run - 1;
  // Do a maximum of 10,000 comparisons per run.
  $comparisons_per_run = 10000;
  // The last string has been compared to all previous strings already, so
  // the second-to-last index is the highest one worth starting from.
  while ($i_max < $count_run - 2) {
    // The amount of comparisons required for another string during this run.
    $comparisons_next_string = $count_run - $i_max - 2;
    if ($comparisons + $comparisons_next_string >= $comparisons_per_run) {
      break;
    }
    $comparisons += $comparisons_next_string;
    $i_max++;
  }

  // Compare strings 0..$i_max with every string after them and collect the
  // matches as a flat list of (sid, sid, similarity) triples.
  $values = array();
  $threshold = (int) variable_get('atrsimilar_profile_' . $profile->pid . '_threshold', 90);
  for ($i = 0; $i <= $i_max; $i++) {
    for ($j = $i + 1; $j < $count_run; $j++) {
      $similarity = atrsimilar_similarity($strings[$i], $strings[$j]);
      if ($similarity > $threshold) {
        $values[] = $sids[$i];
        $values[] = $sids[$j];
        $values[] = $similarity;
      }
    }
  }

  // Save matches to the database with a single multi-row INSERT.
  $matches = (int) (count($values) / 3);
  if ($matches > 0) {
    $placeholders = implode(',', array_fill(0, $matches, '(%d, %d, %d)'));
    db_query("INSERT INTO {atrsimilar_string} VALUES " . $placeholders, $values);
  }

  // Inform Batch API we are not yet finished and provide an estimation of the
  // completion level we reached. The estimate is based on the offset *after*
  // this run, so progress already made in this pass is reported.
  if ($i_max < $count_run - 2) {
    $context['sandbox']['from'] = $from + $i_max + 1;
    $context['finished'] = $context['sandbox']['from'] / $count;
  }
}
?>