Weights-based method

The weights-based algorithm for record linkage:

# Load the record-linkage toolkit and its bundled RLdata500 example data set.
library(RecordLinkage)
data("RLdata500")

# Weight calculation, step 1: build the candidate record pairs.
# Blocking keeps only pairs that agree on column 1, or on all of columns 5-7;
# columns 2-4 are compared with the Levenshtein-based string similarity
# rather than by exact equality.
rec.pairs <- compare.dedup(
  RLdata500,
  blockfld = list(1, 5:7),
  strcmp = c(2, 3, 4),
  strcmpfun = levenshteinSim
)

# Weight calculation, step 2: score every candidate pair with epiWeights()
# and inspect how the resulting weights are distributed.
pairs.weights <- epiWeights(rec.pairs)
hist(pairs.weights$Wdata)

summary(pairs.weights)

# Pull the scored pairs out as a data frame and preview the first rows.
weights.df <- getPairs(pairs.weights)
head(weights.df)

# Classification
# The weights above come from epiWeights(), so they are classified with the
# matching epiClassify() rather than emClassify(), which is documented for
# weights produced by emWeights(). Pairs with weight >= 0.5 are labelled as
# links ("L"), pairs below 0.3 as non-links ("N"), and the rest as possible
# links ("P").
pairs.classify <- epiClassify(
  pairs.weights,
  threshold.upper = 0.5,
  threshold.lower = 0.3
)

# View the matches: start from the compared pairs and attach each pair's
# weight and predicted class.
final.results <- pairs.classify$pairs
final.results[["weight"]] <- pairs.classify$Wdata
final.results[["links"]] <- pairs.classify$prediction
head(final.results)

# Count the pairs per predicted class and plot the distribution.
counts <- table(final.results$links)
barplot(
  counts,
  main = "Link Distribution",
  xlab = "Link Types"
)

# Final output to our customer
# Fetch the scored pairs again, this time with one row per pair.
weights.df.srow <- getPairs(pairs.weights, single.rows = TRUE)

# Keep only the pairs that were classified as links.
final.matches <- final.results[final.results$links == "L", ]

# Join on the columns the two tables share, then drop the `.2`-suffixed
# fields and the `weight` column before previewing the result.
final <- merge(final.matches, weights.df.srow)
final <- subset(
  final,
  select = -c(
    fname_c1.2, fname_c2.2, lname_c1.2, lname_c2.2,
    by.2, bm.2, bd.2,
    weight
  )
)
head(final)

This completes the weights-based method.

..................Content has been hidden....................

You can't read every page of this ebook yet; please click here to log in and view all pages.
Reset