By using db<>fiddle, you agree to license everything you submit by Creative Commons CC0.
-- Using CHAR(integer-value) and ASCII(character-value).
-- The @cv0 and @cv1 variables have been changed from varbinary(max) to varchar(max).
CREATE FUNCTION fn_FuzzyMatch1(
@str1 varchar(max),
@str2 varchar(max) )
RETURNS int
AS
BEGIN
DECLARE
@str1_len int,
@str2_len int,
@i int,
@j int,
@c int,
@c_temp int,
@cost int,
@str1_char char,
--@cv0 varbinary(max),
--@cv1 varbinary(max);
@cv0 varchar(max),
@cv1 varchar(max);
SELECT
@str1_len = LEN(@Str1), --SDU_Tools.StringLength(@str1),
@str2_len = LEN(@Str2), --SDU_Tools.StringLength(@str2),
@cv1 = 0x00,
@j = 1,
@i = 1,
@c = 0;
IF @str1 = @str2
RETURN 0;
ELSE IF @str1_len = 0
RETURN @str2_len;
ELSE IF @str2_len = 0
RETURN @str1_len;
-- Using CONVERT(BINARY(1), integer-value) and CONVERT(INT, binary-value).
CREATE FUNCTION fn_FuzzyMatch2(
@str1 varchar(max),
@str2 varchar(max) )
RETURNS int
AS
BEGIN
DECLARE
@str1_len int,
@str2_len int,
@i int,
@j int,
@c int,
@c_temp int,
@cost int,
@str1_char char,
@cv0 varbinary(max),
@cv1 varbinary(max);
SELECT
@str1_len = LEN(@Str1), --SDU_Tools.StringLength(@str1),
@str2_len = LEN(@Str2), --SDU_Tools.StringLength(@str2),
@cv1 = 0x00,
@j = 1,
@i = 1,
@c = 0;
IF @str1 = @str2
RETURN 0;
ELSE IF @str1_len = 0
RETURN @str2_len;
ELSE IF @str2_len = 0
RETURN @str1_len;
ELSE
BEGIN
WHILE @j <= @str2_len
-- =============================================
-- Computes and returns the Levenshtein edit distance between two strings, i.e. the
-- number of insertion, deletion, and sustitution edits required to transform one
-- string to the other, or NULL if @max is exceeded. Comparisons use the case-
-- sensitivity configured in SQL Server (case-insensitive by default).
--
-- Based on Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm,
-- with some additional optimizations.
-- =============================================
CREATE FUNCTION [dbo].[Levenshtein](
@s nvarchar(4000)
, @t nvarchar(4000)
, @max int
)
RETURNS int
WITH SCHEMABINDING
AS
BEGIN
DECLARE @distance int = 0 -- return variable
, @v0 nvarchar(4000)-- running scratchpad for storing computed distances
, @start int = 1 -- index (1 based) of first non-matching character between the two string
, @i int, @j int -- loop counters: i for s string and j for t string
, @diag int -- distance in cell diagonally above and left if we were using an m by n matrix
, @left int -- distance in cell to the left if we were using an m by n matrix
, @sChar nchar -- character at index i from s string
, @thisJ int -- temporary storage of @j to allow SELECT combining
, @jOffset int -- offset used to calculate starting value for j loop
, @jEnd int -- ending value for j loop (stopping point for processing a column)
-- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore)
, @sLen int = datalength(@s) / datalength(left(left(@s, 1) + '.', 1)) -- length of smaller string
, @tLen int = datalength(@t) / datalength(left(left(@t, 1) + '.', 1)) -- length of larger string
, @lenDiff int -- difference in length between the two strings
-- if strings of different lengths, ensure shorter string is in s. This can result in a little
-- faster speed by spending more time spinning just the inner loop during the main processing.
IF (@sLen > @tLen) BEGIN
CREATE FUNCTION edit_distance(@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
DECLARE @s1_len int, @s2_len int
DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int
DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000)
SELECT
@s1_len = LEN(@s1),
@s2_len = LEN(@s2),
@cv1 = 0x0000,
@j = 1, @i = 1, @c = 0
WHILE @j <= @s2_len
SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1
WHILE @i <= @s1_len
BEGIN
SELECT
@s1_char = SUBSTRING(@s1, @i, 1),
@c = @i,
@cv0 = CAST(@i AS binary(2)),
@j = 1
WHILE @j <= @s2_len
BEGIN
SET @c = @c + 1
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) +
CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END
IF @c > @c_temp SET @c = @c_temp
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1
IF @c > @c_temp SET @c = @c_temp
SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1
END
SELECT
String1, String2,
dbo.fn_FuzzyMatch1(String1, String2) AS fn_FuzzyMatch1,
dbo.fn_FuzzyMatch2(String1, String2) AS fn_FuzzyMatch2,
dbo.Levenshtein(String1, String2, 99) AS Levenshtein,
dbo.edit_distance(String1, String2) AS edit_distance
FROM (
VALUES
('String1', 'String2'),
('Gigantic', 'Titanic'),
('Apples', 'Oranges')
) D(String1, String2)
String1 | String2 | fn_FuzzyMatch1 | fn_FuzzyMatch2 | Levenshtein | edit_distance |
---|---|---|---|---|---|
String1 | String2 | 1 | 1 | 1 | 1 |
Gigantic | Titanic | 3 | 3 | 3 | 3 |
Apples | Oranges | 5 | 5 | 5 | 5 |