Psytherium
06-21-2007, 06:32 PM
Hello,
I have a database of about 16,000 apartments that are for the most part user generated entries. Now I have another database of apartments that I have purchased of about 115,000 apartments. I want to add in these new entries to the old one, but I am concerned about duplicates. As you can imagine with the original content being user generated, there are some typos, non-official names, etc. I've generated this script to try and identify duplicates:
<?php
$states = array("'AK'", "'AL'", "'AR'", "'AZ'", "'CA'", "'CO'", "'CT'", "'DC'", "'DE'", "'FL'", "'GA'", "'HI'", "'IA'", "'ID'", "'IL'", "'IN'", "'KS'", "'KY'", "'LA'", "'MA'", "'MD'", "'ME'", "'MI'", "'MN'", "'MO'", "'MS'", "'MT'", "'NC'", "'ND'", "'NE'", "'NH'", "'NJ'", "'NM'", "'NV'", "'NY'", "'OH'", "'OK'", "'OR'", "'PA'", "'PR'", "'RI'", "'SC'", "'SD'", "'TN'", "'TX'", "'UT'", "'VA'", "'VT'", "'WA'", "'WI'", "'WV'", "'WY'");
$alpha = array("'a'", "'b'", "'c'", "'d'", "'e'", "'f'", "'g'", "'h'", "'i'", "'j'", "'k'", "'l'", "'m'", "'n'", "'o'", "'p'", "'q'", "'r'", "'s'", "'t'", "'u'", "'v'", "'w'", "'x'", "'y'", "'z'");
$db = mysql_connect ('localhost', '******', '********')
or die ("Unable to connect to Database Server");
mysql_select_db ('dev01', $db)
or die ("Could not select database");
$start = microtime(true);
$count = 0;
for($i = 0; $i < 50; $i++)
{
for( $j = 0; $j < 26; $j++)
{
$query = "SELECT apartmentname, apartmentcity, apartmentstate FROM apartmentinfo WHERE apartmentstate = {$states[$i]} AND LEFT(apartmentname,1) like {$alpha[$j]}";
$result = mysql_query($query, $db) // WHERE LEFT(apartmentname,1) like 'z' LIMIT
or die("Invalid query: " . mysql_error());
$num=mysql_numrows($result);
$array = array( array() );
for( $k=0; $k< $num; $k++ ) {
$array[$k] = mysql_fetch_array( $result, MYSQL_ASSOC );
}
for( $k=0; $k< $num-1; $k++ ) {
for( $l=$k+1; $l< $num; $l++ ) {
if($array[$k]['apartmentstate'] == $array[$l]['apartmentstate']
&& $array[$k]['apartmentcity'] == $array[$l]['apartmentcity'])
{
$percent = 0.0;
similar_text($array[$k]['apartmentname'], $array[$l]['apartmentname'], $percent);
if($percent > 80)
$count++;
}
}
}
}
}
echo $count;
echo "<br />";
echo (microtime(true) - $start);
?>
Am I on the right track, or should I go about this differently? I would love to make this as accurate as possible so I can automate some of the elimination of duplicates, or else I will just have to settle with flagging possible duplicates and changing them myself.
Thanks!
I have a database of about 16,000 apartments that are for the most part user generated entries. Now I have another database of apartments that I have purchased of about 115,000 apartments. I want to add in these new entries to the old one, but I am concerned about duplicates. As you can imagine with the original content being user generated, there are some typos, non-official names, etc. I've generated this script to try and identify duplicates:
<?php
$states = array("'AK'", "'AL'", "'AR'", "'AZ'", "'CA'", "'CO'", "'CT'", "'DC'", "'DE'", "'FL'", "'GA'", "'HI'", "'IA'", "'ID'", "'IL'", "'IN'", "'KS'", "'KY'", "'LA'", "'MA'", "'MD'", "'ME'", "'MI'", "'MN'", "'MO'", "'MS'", "'MT'", "'NC'", "'ND'", "'NE'", "'NH'", "'NJ'", "'NM'", "'NV'", "'NY'", "'OH'", "'OK'", "'OR'", "'PA'", "'PR'", "'RI'", "'SC'", "'SD'", "'TN'", "'TX'", "'UT'", "'VA'", "'VT'", "'WA'", "'WI'", "'WV'", "'WY'");
$alpha = array("'a'", "'b'", "'c'", "'d'", "'e'", "'f'", "'g'", "'h'", "'i'", "'j'", "'k'", "'l'", "'m'", "'n'", "'o'", "'p'", "'q'", "'r'", "'s'", "'t'", "'u'", "'v'", "'w'", "'x'", "'y'", "'z'");
$db = mysql_connect ('localhost', '******', '********')
or die ("Unable to connect to Database Server");
mysql_select_db ('dev01', $db)
or die ("Could not select database");
$start = microtime(true);
$count = 0;
for($i = 0; $i < 50; $i++)
{
for( $j = 0; $j < 26; $j++)
{
$query = "SELECT apartmentname, apartmentcity, apartmentstate FROM apartmentinfo WHERE apartmentstate = {$states[$i]} AND LEFT(apartmentname,1) like {$alpha[$j]}";
$result = mysql_query($query, $db) // WHERE LEFT(apartmentname,1) like 'z' LIMIT
or die("Invalid query: " . mysql_error());
$num=mysql_numrows($result);
$array = array( array() );
for( $k=0; $k< $num; $k++ ) {
$array[$k] = mysql_fetch_array( $result, MYSQL_ASSOC );
}
for( $k=0; $k< $num-1; $k++ ) {
for( $l=$k+1; $l< $num; $l++ ) {
if($array[$k]['apartmentstate'] == $array[$l]['apartmentstate']
&& $array[$k]['apartmentcity'] == $array[$l]['apartmentcity'])
{
$percent = 0.0;
similar_text($array[$k]['apartmentname'], $array[$l]['apartmentname'], $percent);
if($percent > 80)
$count++;
}
}
}
}
}
echo $count;
echo "<br />";
echo (microtime(true) - $start);
?>
Am I on the right track, or should I go about this differently? I would love to make this as accurate as possible so I can automate some of the elimination of duplicates, or else I will just have to settle with flagging possible duplicates and changing them myself.
Thanks!