String precision/distance/percentage equality check

This may be a little hard to explain but, for the heck of it I decided to look up code that precisely compares two strings and returns a distance percentage between those strings. Here is the problem: I am trying to take two strings and precisely compare them using precision equality, for example:

String 1 = "Test string"
String 2 = "Test"

Now, by precision standards these strings distance from each other would equal to be 50%. I am trying to do that with ALL strings.

The best way I can describe what I am trying to accomplish is: taking two strings and seeing how far away (in distance) they are from each other (like the quoted example). I know that there is a Double method for this in swift called the Jaro Winkler Distance calculation but that is where the problem comes in, I implement the Jaro Winkler solution that I found online into my project but, when I try to compare these two strings:

String 1 = "This is a test string of text - test string"
String 2 = "This is a test string of text - test string"

I get that the total distance between those strings is: 93.48837% when in fact, the strings are identical and should give me the resulting distance of 100%. Am I missing something because if I input "Test" into field 1 and "Test" into field two, I get a result of "100%" as the distance comparison output.

Here is the code I found online, it is an extension of string
Code Block
extension String {
    public func distanceJaroWinkler(between target: String) -> Double {
            var stringOne = self
            var stringTwo = target
            if stringOne.count > stringTwo.count {
                stringTwo = self
                stringOne = target
            }
            let stringOneCount = stringOne.count
            let stringTwoCount = stringTwo.count
            if stringOneCount == 0 && stringTwoCount == 0 {
                return 1.0
            }
            let matchingDistance = stringTwoCount / 2
            var matchingCharactersCount: Double = 0
            var transpositionsCount: Double = 0
            var previousPosition = -1
            // Count matching characters and transpositions.
            for (i, stringOneChar) in stringOne.enumerated() {
                for (j, stringTwoChar) in stringTwo.enumerated() {
                    if max(0, i - matchingDistance)..<min(stringTwoCount, i + matchingDistance) ~= j {
                        if stringOneChar == stringTwoChar {
                            matchingCharactersCount += 1
                            if previousPosition != -1 && j < previousPosition {
                                transpositionsCount += 1
                            }
                            previousPosition = j
                            break
                        }
                    }
                }
            }
            if matchingCharactersCount == 0.0 {
                return 0.0
            }
            // Count common prefix (up to a maximum of 4 characters)
            let commonPrefixCount = min(max(Double(self.commonPrefix(with: target).count), 0), 4)
            let jaroSimilarity = (matchingCharactersCount / Double(stringOneCount) + matchingCharactersCount / Double(stringTwoCount) + (matchingCharactersCount - transpositionsCount) / matchingCharactersCount) / 3
            // Default is 0.1, should never exceed 0.25 (otherwise similarity score could exceed 1.0)
            let commonPrefixScalingFactor = 0.1
            return jaroSimilarity + commonPrefixCount * commonPrefixScalingFactor * (1 - jaroSimilarity)
        }
}



Answered by OOPer in 638709022
I do not know much about  Jaro Winkler Distance, but as far as I studied with wiki or some site showing sample code, the transpositionsCount calculated by your code seems to be wrong.

Your code generates transpositionsCount as 14.0 for input ("This is a test string of text - test string", "This is a test string of text - test string"), but it should be 0.0 for exactly identical strings.

I'm not sure, but calculating matchingCharactersCount and transpositionsCount in the same loop seems to be very difficult and your code has some flaw in it.

The following is the code converted from the C# code in a site (seems the dev forums does not like the URL, please search with "jaro winkler similarity calculator").
Code Block
// Function to calculate the
// Jaro Similarity of two strings
func jaroDistance(_ s1: String, _ s2: String) -> Double {
// If the strings are equal
//if s1 == s2 {
// return 1.0
//}
// Length of two strings
let len1 = s1.count,
len2 = s2.count
//
if len1 == 0 || len2 == 0 {
return 0.0
}
// Maximum distance upto which matching
// is allowed
let maxDist = max(len1, len2) / 2 - 1
// Count of matches
var match = 0
// Hash for matches
var hashS1: [Int] = Array(repeating: 0, count: s1.count)
var hashS2: [Int] = Array(repeating: 0, count: s2.count)
let s2Array = Array(s2)
// Traverse through the first string
for (i, ch1) in s1.enumerated() {
// Check if there is any matches
if max(0, i - maxDist) > min(len2 - 1, i + maxDist) {
continue
}
for j in max(0, i - maxDist)...min(len2 - 1, i + maxDist) {
// If there is a match
if ch1 == s2Array[j] &&
hashS2[j] == 0 {
hashS1[i] = 1
hashS2[j] = 1
match += 1
break
}
}
}
// If there is no match
if match == 0 {
return 0.0
}
// Number of transpositions
var t: Double = 0
var point = 0
// Count number of occurances
// where two characters match but
// there is a third matched character
// in between the indices
for (i, ch1) in s1.enumerated() {
if hashS1[i] == 1 {
// Find the next matched character
// in second string
while hashS2[point] == 0 {
point += 1
}
if ch1 != s2Array[point] {
t += 1
}
point += 1
}
}
t /= 2
print(s1.count, s2.count, match, t)
// Return the Jaro Similarity
return (Double(match) / Double(len1)
+ Double(match) / Double(len2)
+ (Double(match) - t) / Double(match))
/ 3.0
}
// Jaro Winkler Similarity
func jaroWinkler(_ s1: String, _ s2: String) -> Double {
var jaroDist = jaroDistance(s1, s2)
print("Jaro Similarity =", jaroDist)
// If the jaro Similarity is above a threshold
if jaroDist > 0.7 {
// Find the length of common prefix
let prefixStr = s1.commonPrefix(with: s2)
// Maximum of 4 characters are allowed in prefix
let prefix = Double(min(4, prefixStr.count))
// Calculate jaro winkler Similarity
jaroDist += 0.1 * prefix * (1 - jaroDist)
}
return jaroDist
}

Please compare the values of match and t with the values of your code matchingCharactersCount and transpositionsCount.
Accepted Answer
I do not know much about  Jaro Winkler Distance, but as far as I studied with wiki or some site showing sample code, the transpositionsCount calculated by your code seems to be wrong.

Your code generates transpositionsCount as 14.0 for input ("This is a test string of text - test string", "This is a test string of text - test string"), but it should be 0.0 for exactly identical strings.

I'm not sure, but calculating matchingCharactersCount and transpositionsCount in the same loop seems to be very difficult and your code has some flaw in it.

The following is the code converted from the C# code in a site (seems the dev forums does not like the URL, please search with "jaro winkler similarity calculator").
Code Block
// Function to calculate the
// Jaro Similarity of two strings
func jaroDistance(_ s1: String, _ s2: String) -> Double {
// If the strings are equal
//if s1 == s2 {
// return 1.0
//}
// Length of two strings
let len1 = s1.count,
len2 = s2.count
//
if len1 == 0 || len2 == 0 {
return 0.0
}
// Maximum distance upto which matching
// is allowed
let maxDist = max(len1, len2) / 2 - 1
// Count of matches
var match = 0
// Hash for matches
var hashS1: [Int] = Array(repeating: 0, count: s1.count)
var hashS2: [Int] = Array(repeating: 0, count: s2.count)
let s2Array = Array(s2)
// Traverse through the first string
for (i, ch1) in s1.enumerated() {
// Check if there is any matches
if max(0, i - maxDist) > min(len2 - 1, i + maxDist) {
continue
}
for j in max(0, i - maxDist)...min(len2 - 1, i + maxDist) {
// If there is a match
if ch1 == s2Array[j] &&
hashS2[j] == 0 {
hashS1[i] = 1
hashS2[j] = 1
match += 1
break
}
}
}
// If there is no match
if match == 0 {
return 0.0
}
// Number of transpositions
var t: Double = 0
var point = 0
// Count number of occurances
// where two characters match but
// there is a third matched character
// in between the indices
for (i, ch1) in s1.enumerated() {
if hashS1[i] == 1 {
// Find the next matched character
// in second string
while hashS2[point] == 0 {
point += 1
}
if ch1 != s2Array[point] {
t += 1
}
point += 1
}
}
t /= 2
print(s1.count, s2.count, match, t)
// Return the Jaro Similarity
return (Double(match) / Double(len1)
+ Double(match) / Double(len2)
+ (Double(match) - t) / Double(match))
/ 3.0
}
// Jaro Winkler Similarity
func jaroWinkler(_ s1: String, _ s2: String) -> Double {
var jaroDist = jaroDistance(s1, s2)
print("Jaro Similarity =", jaroDist)
// If the jaro Similarity is above a threshold
if jaroDist > 0.7 {
// Find the length of common prefix
let prefixStr = s1.commonPrefix(with: s2)
// Maximum of 4 characters are allowed in prefix
let prefix = Double(min(4, prefixStr.count))
// Calculate jaro winkler Similarity
jaroDist += 0.1 * prefix * (1 - jaroDist)
}
return jaroDist
}

Please compare the values of match and t with the values of your code matchingCharactersCount and transpositionsCount.
String precision/distance/percentage equality check
 
 
Q