The following piece of code, reads a list of problematic strings and tries to match them using a list of valid codes.
In order to get the best match there is a scoring for each matching letter, taking in account also the position of the matching character.
The size of the term compared is taken under consideration as well, because it is logical that longer terms have more letters, therefore better scoring.
The results are very satisfactory.
declare
term_size1 number;
term_size2 number;
max_relevant_chars number := 0;
old_max_relevant_chars number := 0;
best_match varchar2(500);
relational_position number;
begin
for i in (
select supplier_id, supplier_code from suppliers where trim(supplier_code) not in
(select supplier_code from official_suppliers_list)
order by trim(supplier_code)) loop
term_size1 := length(i.supplier_code);
best_match := '';
old_max_relevant_chars := 0;
for j in (select supplier_code from official_suppliers_list) loop
term_size2 := length(j.supplier_code);
max_relevant_chars := 0;
for x in 1..term_size1 loop
for y in 1..term_size2 loop
if(upper(substr(i.supplier_code,x,1)) = upper(substr(j.supplier_code,y,1))) then
IF(X-Y)=0 THEN
relational_position := 1;
ELSE
relational_position := ABS((term_size2/term_size1)/(X-Y));
END IF;
max_relevant_chars := max_relevant_chars +
1*(term_size1/term_size2)*relational_position;
exit;
end if;
end loop;
end loop;
if(old_max_relevant_chars < max_relevant_chars)then
best_match := j.supplier_code;
old_max_relevant_chars := max_relevant_chars;
end if;
end loop;
dbms_output.put_line('update suppliers set supplier_code=''' || best_match|| '''
where supplier_id='||i.supplier_id||' and supplier_code='''||i.supplier_code || ''';');
end loop;
end;