/*
* diff -- computes the difference between two strings
* (the minimal number of changes needed to make one equal the other where
* a change is one of the following:
* - replacing a character by another one
* - inserting a character
* - deleting a character
* - swapping two consecutive characters)
*
* by Alfe for TubMud 99-Mar-10
* original idea and a first LPC implementation by Ugh
*
* Notes about the implementation:
* The algorithm conducts an exhaustive search of a tree, which nodes
* consists of the tuple (pos in string a, pos in string b, difference
* computed while advancing the positions so far). Every node has one
* to four children:
* If the characters at the positions match: a new tuple (a+1, b+1, diff).
* If the characters at the positions don't match:
* 1. Always: new tuple (a+1, b+1, diff+1)
* 2. Characters at the position and the next are swapped,
* new tuple (a+2, b+2, diff+1)
* 3. Always: Possible insertion in a, new tuple (a+1, b, diff+1)
* 4. Always: Possible insertion in b, new tuple (a, b+1, diff+1)
*
* The implementation exploits that only the 'diff' is required for the
* result, which allows to keep all tuples in a single list instead
* of explicitely constructing a tree. The list is sorted by the 'diff'
* value of the tuples in increasing order. Since the construction of
* the tree always creates the children for the currently first tuple
* in the list (which is removed by this), the effect for the tree building
* is that always the currently most promising branch of all is searched
* first - it is impossible to get stuck in a local optimum.
*
* The disadvantage is the possible high memory usage: for two strings
* with pairwise swapped characters (e.g. 'abcdef' and 'badcfe') I
* guesstimate O(3**N) created nodes (N being the length of the strings)
* and computation costs of O(2**N). I better dig out the functions
* I wrote in a former life for this problem...
*/
#define DEBUG
#ifdef DEBUG
# define _D_ printf("%s %d\n",__FILE__,__LINE__)
# define _D(x) printf x
int global_debug_counter = 0;
#else
# define _D_
# define _D(x)
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int diff(char *,char *,int);
void cleanup_diff_pool();
void main(int argc,char *argv[]) {
if (argc != 4) {
printf("usage: %s <arg1> <arg2> <max>\n",argv[0]);
exit(1);
}
printf("%d\n",diff(argv[1],argv[2],atoi(argv[3])));
/*
* cleanup_diff_pool should be called from time to time when memory
* is needed. to call it here is just one solution (especially in
* combination with debug output to check that all entries are freed
* again)
*/
cleanup_diff_pool();
}
/*
* an entry in the list of found positions
* a -- position in first string
* b -- position in second string
* d -- difference at that point
*/
typedef struct entry {
struct entry *next;
int a,b,d;
} entry;
/*
* this is a pool of disposed entries which are reused to reduce memory
* management overhead.
*/
entry *diff_pool = NULL;
/*
* make a new entry, fill it with the given values, return it.
* this function could well use a pool of old disposed entries instead
* of calling malloc() to create a new entry.
*/
entry *make_entry(int a,int b,int d) {
entry *e;
#define USE_DISPOSED_POOL
#ifndef USE_DISPOSED_POOL
e = malloc(sizeof(entry));
_D(("mem %p %5d %4d %4d %3d malloc\n",e,global_debug_counter++,a,b,d));
#else
if (diff_pool) {
e = diff_pool;
diff_pool = diff_pool->next;
_D(("mem %p %5d %4d %4d %3d reuse\n",e,global_debug_counter++,a,b,d));
}
else {
e = malloc(sizeof(entry));
_D(("mem %p %5d %4d %4d %3d malloc\n",e,global_debug_counter++,a,b,d));
}
#endif
e->a = a;
e->b = b;
e->d = d;
return e;
}
/*
* dispose the given entry (e.g. free it); this function could well put
* the entry into a pool of disposed entries to reduce memory management
* overhead.
*/
void dispose_entry(entry **ep) {
#ifndef USE_DISPOSED_POOL
_D(("mem %p %5d %4d %4d %3d free\n",(*ep),global_debug_counter++,
(*ep)->a,(*ep)->b,(*ep)->d));
free(*ep);
#else
_D(("mem %p %5d %4d %4d %3d dispose\n",(*ep),global_debug_counter++,
(*ep)->a,(*ep)->b,(*ep)->d));
(*ep)->next = diff_pool;
diff_pool = (*ep);
#endif
(*ep) = NULL;
}
/*
* this should cleanup the memory, i.e. free the pool of old disposed entries
* if any exists. it should be called from time to time in a running system,
* but not too often to reduce memory management overhead.
*/
void cleanup_diff_pool() {
int counter;
entry *walker,*e;
walker = diff_pool;
for (counter=0; walker; counter++) {
e = walker;
walker = walker->next;
_D(("mem %p %5d %4d %4d %3d free\n",e,global_debug_counter++,
e->a,e->b,e->d));
free(e);
}
diff_pool = NULL;
_D(("info %d\n",counter));
}
/*
* insert the given entry into the given list at the appropriate place.
*/
void insert_entry(entry *e,entry **startp) {
entry *walker;
if (!(*startp) || // list empty?
e->d < (*startp)->d) { // less than the start entry?
e->next = (*startp);
(*startp) = e;
return;
}
walker = (*startp);
while (walker->next && e->d >= walker->next->d) {
if (walker->a == e->a &&
walker->b == e->b) { // a shorter way to this pos already in list?
dispose_entry(&e);
return;
}
walker = walker->next;
}
// normal case: found place to insert entry
e->next = walker->next;
walker->next = e;
}
/*
* use the first entry of the given queue and produce children of it; add
* them to the list of entries at the appropriate places.
*/
void add_new_entries(char *a,char *b,entry **startp) {
entry *old;
int ap,bp,d;
ap = (*startp)->a;
bp = (*startp)->b;
d = (*startp)->d;
// remove first from list (we will process it now)
old = (*startp);
(*startp) = (*startp)->next;
dispose_entry(&old);
if (a[ap] && b[bp]) { // both strings not at their end?
// create usual child (both strings one further)
if (a[ap] == b[bp]) { // chars equal?
insert_entry(make_entry(ap+1,bp+1,d),startp);
return;
}
else { // not equal, counts as one difference
insert_entry(make_entry(ap+1,bp+1,d+1),startp);
// check for swapped letters: one diff
if (a[ap+1] && b[bp+1]) // both strings still have one more char?
if (a[ap] == b[bp+1] &&
b[bp] == a[ap+1])
insert_entry(make_entry(ap+2,bp+2,d+1),startp);
}
}
if (a[ap]) { // first not at its end?
// create difference child (assume missing letter in second: one diff)
insert_entry(make_entry(ap+1,bp,d+1),startp);
}
if (b[bp]) { // second not at its end?
// create difference child (assume missing letter in first: one diff)
insert_entry(make_entry(ap,bp+1,d+1),startp);
}
}
int diff(char *a,char *b,int max) {
char *s;
int al,bl,h;
entry *e,*start;
al = strlen(a);
bl = strlen(b);
start = make_entry(0,0,0); // start value: at pos (0,0) difference 0
while (start->a < al ||
start->b < bl) {
if (max > 0 && start->d >= max) // maximum reached?
break;
add_new_entries(a,b,&start);
}
h = start->d;
while (start) {
e = start;
start = start->next;
dispose_entry(&e);
}
return h;
}