1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
// Stored procedure: IsItUnique CREATE PROCEDURE [dbo].[IsItUnique] @Subject NVARCHAR(255), @Message NVARCHAR(MAX), @Score FLOAT OUT, @Result BIT OUT AS BEGIN SET NOCOUNT ON; DECLARE @SS INT SET @SS = 80 DECLARE records CURSOR FAST_FORWARD FOR SELECT Id FROM Posts WHERE dbo.SimilarityScore(dbo.CleanString(dbo.CleanString(dbo.CleanString(@Subject, 're:'), 'RE:'), 'Re:'), Subject) >= @SS SET @Result = 1 OPEN records DECLARE @Id UNIQUEIDENTIFIER FETCH NEXT FROM records INTO @Id WHILE @@FETCH_STATUS = 0 BEGIN IF @Result = 1 BEGIN DECLARE @Text NVARCHAR(MAX) SELECT @Text = dbo.PostMessages.Text FROM dbo.PostMessages WHERE dbo.PostMessages.PostId = @Id SET @Score = dbo.SimilarityScore(@Message, @Text) IF @Score >= @SS BEGIN SET @Result = 0 END END FETCH NEXT FROM records INTO @Id END CLOSE records DEALLOCATE records END // Function: SimilarityScore CREATE FUNCTION [dbo].[SimilarityScore] ( @Prospect NVARCHAR(MAX), @Existing NVARCHAR(MAX) ) RETURNS FLOAT AS BEGIN DECLARE @Score FLOAT DECLARE @Diff INT SET @Diff = dbo.DifferenceScore(@Prospect, @Existing) DECLARE @Tokens INT SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect, DEFAULT, DEFAULT) RETURN ((@Tokens - @Diff) * 100) / @Tokens END // Function: WordTokens CREATE FUNCTION [dbo].[WordTokens] ( @string NVARCHAR(MAX), @WordStartCharacters VARCHAR(255) = 'a-z', @WordCharacters VARCHAR(255) = '-a-z''' ) RETURNS @Results TABLE ( SeqNo INT IDENTITY(1, 1), Item VARCHAR(255), TokenType INT ) AS /* This table function produces a table which divides up the words and the spaces between the words in some text and produces a table of the two types of token in the sequence in which they were found */ BEGIN DECLARE @Pos INT, --index of current search @WhereWeAre INT,--index into string so far @ii INT, --the number of words found so far @next INT, --where the next search starts @size INT --the total size of the text SELECT @ii = 0, @WhereWeAre = 1, @size = DATALENGTH(@string) WHILE @Size >= @WhereWeAre BEGIN SELECT @pos = PATINDEX('%[' + @wordStartCharacters + ']%', SUBSTRING(@string, @whereWeAre, 4000)) IF @pos > 0 BEGIN IF @pos > 1 INSERT INTO @Results ( item, tokentype ) SELECT SUBSTRING(@String, @whereWeAre, @pos - 1), 2 SELECT @next = @WhereWeAre + @pos, @ii = @ii + 1 SELECT @pos = PATINDEX('%[^' + @wordCharacters + ']%', SUBSTRING(@string, @next, 4000) + ' ') INSERT INTO @Results ( item, tokentype ) SELECT SUBSTRING(@String, @next - 1, @pos), 1 SELECT @WhereWeAre = @next + @pos - 1 END ELSE BEGIN IF LEN(REPLACE( SUBSTRING(@String, @whereWeAre, 4000), ' ', '!' )) > 0 INSERT INTO @Results ( item, tokentype ) SELECT SUBSTRING(@String, @whereWeAre, 4000), 2 SELECT @whereWeAre = @WhereWeAre + 4000 END END RETURN END // Function: DifferenceScore CREATE FUNCTION [dbo].[DifferenceScore] ( @Sample NVARCHAR(MAX), @comparison NVARCHAR(MAX) ) RETURNS INT AS BEGIN DECLARE @results TABLE ( token_ID INT IDENTITY(1, 1), sequenceNumber INT, Sample_ID INT, Item VARCHAR(255), TokenType INT ) /* This function returns the number of differences it found between two pieces of text */ INSERT INTO @results ( SequenceNumber, Sample_ID, Item, Tokentype ) SELECT seqno, 1, item, tokentype FROM dbo.WordTokens(@sample, DEFAULT, DEFAULT) INSERT INTO @results ( SequenceNumber, Sample_ID, Item, Tokentype ) SELECT seqno, 2, item, tokentype FROM dbo.WordTokens(@comparison, DEFAULT, DEFAULT) DECLARE @closestMatch TABLE ( sequenceNumber INT, skew INT ) INSERT INTO @closestMatch ( sequencenumber, skew ) SELECT COALESCE(a.sequencenumber, b.sequencenumber), COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000) - COALESCE(a.sequencenumber, 1000))), -1) FROM ( SELECT * FROM @results WHERE sample_ID = 1 AND tokentype = 1 ) a FULL OUTER JOIN ( SELECT * FROM @results WHERE sample_ID = 2 AND tokentype = 1 ) b ON a.item = b.item GROUP BY COALESCE(a.sequencenumber, b.sequencenumber) ORDER BY COALESCE(a.sequencenumber, b.sequencenumber) RETURN ( SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0 ELSE 1 END) FROM @closestmatch a INNER JOIN @closestMatch b ON b.sequenceNumber = a.sequenceNumber + 2 ) END // CLR Function: CleanString public partial class UserDefinedFunctions { [SqlFunction] public static SqlString CleanString(SqlString Value, SqlString Token) { return Value.ToString().Replace(Token.ToString(), "").Trim(); } };
Refactorings
No refactoring yet !
First of all I apologize for posting this in C# it should be in SQL (which I am not able to find).
What I am trying to do here is to calculate similarity between two piece of text via a stored procedure.