42d18a359135982c319658d02c826bfb

First of all I apologize for posting this in C# it should be in SQL (which I am not able to find).

What I am trying to do here is to calculate similarity between two piece of text via a stored procedure.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// Stored procedure: IsItUnique

CREATE PROCEDURE [dbo].[IsItUnique]
	@Subject NVARCHAR(255),
	@Message NVARCHAR(MAX),
	@Score FLOAT OUT,
	@Result BIT OUT
AS
BEGIN
	SET NOCOUNT ON;

	DECLARE @SS INT
	SET @SS = 80
	
	DECLARE records CURSOR FAST_FORWARD FOR SELECT Id FROM Posts WHERE dbo.SimilarityScore(dbo.CleanString(dbo.CleanString(dbo.CleanString(@Subject, 're:'), 'RE:'), 'Re:'), Subject) >= @SS

	SET @Result = 1
	OPEN records
		DECLARE @Id UNIQUEIDENTIFIER
		
		FETCH NEXT FROM records INTO @Id
		
		WHILE @@FETCH_STATUS = 0
		BEGIN
			
			IF @Result = 1
			BEGIN
				DECLARE @Text NVARCHAR(MAX)
				SELECT @Text = dbo.PostMessages.Text FROM dbo.PostMessages WHERE dbo.PostMessages.PostId = @Id
				
				SET @Score = dbo.SimilarityScore(@Message, @Text)
				
				IF @Score >= @SS
				BEGIN
					SET @Result = 0
				END
			END
			
			FETCH NEXT FROM records INTO @Id
		END

	CLOSE records
	DEALLOCATE records
END

// Function: SimilarityScore

CREATE FUNCTION [dbo].[SimilarityScore]
(
	@Prospect NVARCHAR(MAX),
	@Existing NVARCHAR(MAX)
)
RETURNS FLOAT
AS
BEGIN
	DECLARE @Score FLOAT

	DECLARE @Diff INT
	SET @Diff = dbo.DifferenceScore(@Prospect, @Existing)
	
	DECLARE @Tokens INT
	SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect, DEFAULT, DEFAULT)

	RETURN ((@Tokens - @Diff) * 100) / @Tokens

END

// Function: WordTokens

CREATE FUNCTION [dbo].[WordTokens]
  (
    @string NVARCHAR(MAX),
    @WordStartCharacters VARCHAR(255) = 'a-z',
    @WordCharacters VARCHAR(255) = '-a-z'''
  )
RETURNS @Results TABLE
  (
    SeqNo INT IDENTITY(1, 1),
    Item VARCHAR(255),
    TokenType INT
  )
AS /*
This table function produces a table which divides up the words and 
the spaces between the words in some text and produces a table of the
two types of token in the sequence in which they were found
*/
   BEGIN
    DECLARE @Pos INT,    --index of current search
      @WhereWeAre INT,--index into string so far
      @ii INT,    --the number of words found so far
      @next INT,  --where the next search starts 
      @size INT   --the total size of the text

    SELECT  @ii = 0, @WhereWeAre = 1, @size = DATALENGTH(@string)


    WHILE @Size >= @WhereWeAre
      BEGIN
        SELECT  @pos = PATINDEX('%[' + @wordStartCharacters + ']%',
                                SUBSTRING(@string, @whereWeAre, 4000))
        IF @pos > 0 
          BEGIN
            IF @pos > 1 
              INSERT  INTO @Results
                      ( item, tokentype )
                      SELECT  SUBSTRING(@String, @whereWeAre, @pos - 1), 2
            SELECT  @next = @WhereWeAre + @pos, @ii = @ii + 1
            SELECT  @pos = PATINDEX('%[^' + @wordCharacters + ']%',
                                    SUBSTRING(@string, @next, 4000) + ' ')
            INSERT  INTO @Results
                    ( item, tokentype )
                    SELECT  SUBSTRING(@String, @next - 1, @pos), 1
            SELECT  @WhereWeAre = @next + @pos - 1
          END
        ELSE 
          BEGIN
            IF LEN(REPLACE(
                    SUBSTRING(@String, @whereWeAre, 4000), ' ', '!'
            )) > 0 
              INSERT  INTO @Results
                      ( item, tokentype )
                      SELECT  SUBSTRING(@String, @whereWeAre, 4000), 2
            SELECT  @whereWeAre = @WhereWeAre + 4000
          END
      END
    RETURN
   END 

// Function: DifferenceScore

CREATE FUNCTION [dbo].[DifferenceScore]
  (
    @Sample NVARCHAR(MAX),
    @comparison NVARCHAR(MAX)
  )
RETURNS INT
AS BEGIN
    DECLARE @results TABLE
      (
        token_ID INT IDENTITY(1, 1),
        sequenceNumber INT,
        Sample_ID INT,
        Item VARCHAR(255),
        TokenType INT
      )
/*
This function returns the number of differences it found between two pieces
of text
*/
    INSERT  INTO @results
            ( SequenceNumber, Sample_ID, Item, Tokentype )
            SELECT  seqno, 1, item, tokentype
            FROM    dbo.WordTokens(@sample, DEFAULT, DEFAULT)

    INSERT  INTO @results
            ( SequenceNumber, Sample_ID, Item, Tokentype )
            SELECT  seqno, 2, item, tokentype
            FROM    dbo.WordTokens(@comparison, DEFAULT, DEFAULT)
    DECLARE @closestMatch TABLE
      (
        sequenceNumber INT,
        skew INT
      )
    INSERT  INTO @closestMatch
            ( sequencenumber, skew )
            SELECT  COALESCE(a.sequencenumber, b.sequencenumber),
                    COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000) 
                        - COALESCE(a.sequencenumber, 1000))),
                             -1)
            FROM    ( SELECT  *
                      FROM    @results
                      WHERE   sample_ID = 1 AND tokentype = 1
                    ) a FULL OUTER JOIN ( SELECT  *
                                          FROM    @results
                                          WHERE   sample_ID = 2 
                                              AND tokentype = 1
                                        ) b ON a.item = b.item
            GROUP BY COALESCE(a.sequencenumber, b.sequencenumber)
            ORDER BY COALESCE(a.sequencenumber, b.sequencenumber)



    RETURN ( SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0
                             ELSE 1
                        END)
             FROM   @closestmatch a INNER JOIN @closestMatch b 
                  ON b.sequenceNumber = a.sequenceNumber + 2
           )
   END

// CLR Function: CleanString

public partial class UserDefinedFunctions {
	[SqlFunction]
	public static SqlString CleanString(SqlString Value, SqlString Token) {
		return Value.ToString().Replace(Token.ToString(), "").Trim();
	}
};

Refactorings

No refactoring yet !

39e6d6bdacb7a70e7793917f10f52bbf

school grants

April 5, 2010, April 05, 2010 12:25, permalink

No rating. Login to rate!

What a great resource!

Your refactoring





Format Copy from initial code

or Cancel