SQL Server 2008 and HashBytes - sql-server-2008

I have quite a large nvarchar which I wish to pass to the HashBytes function.
I get the error:
"String or binary would be truncated.
Cannot insert the value NULL into
column 'colname', table 'table';
column does not allow nulls. UPDATE
fails. The statement has been
terminated."
Being ever resourceful, I discovered this was due to the HashBytes function having a maximum input limit of 8000 bytes. Further searching showed me a 'solution' where my large varchar would be divided into chunks, each chunk hashed separately, and the results later combined with this user-defined function:
-- Question's original UDF: walks the input in 8000-byte slices, hashes each slice with
-- HASHBYTES and appends every digest to #ReturnSum, which is returned as-is.
-- NOTE(review): the `#` prefix on every variable looks like a transcription artefact of
-- T-SQL's `@` sigil, and the leading CREATE keyword appears to be missing - confirm against
-- the original post. The code is left untouched because the discussion below refers to its output.
function [dbo].[udfLargeHashTable] (#algorithm nvarchar(4), #InputDataString varchar(MAX))
RETURNS varbinary(MAX)
AS
BEGIN
DECLARE
#Index int,
#InputDataLength int,
#ReturnSum varbinary(max),
#InputData varbinary(max)
-- NOTE(review): the integer literal 0 is converted to varbinary here, which presumably seeds
-- the result with four zero bytes (0x00000000) in front of the first digest - verify.
SET #ReturnSum = 0
SET #Index = 1
-- NOTE(review): CONVERT to binary without a length uses the default length (30 per the
-- CAST/CONVERT documentation), so only the first 30 bytes of the input would be hashed - verify.
SET #InputData = convert(binary,#InputDataString)
SET #InputDataLength = DATALENGTH(#InputData)
-- One HASHBYTES call per 8000-byte slice; digests are concatenated, not re-hashed.
WHILE #Index <= #InputDataLength
BEGIN
SET #ReturnSum = #ReturnSum + HASHBYTES(#algorithm, SUBSTRING(#InputData, #Index, 8000))
SET #Index = #Index + 8000
END
RETURN #ReturnSum
END
which I call with:
set #ReportDefinitionHash=convert(int,dbo.[udfLargeHashTable]('SHA1',#ReportDefinitionForLookup))
Where #ReportDefinitionHash is int, and #ReportDefinitionForLookup is the varchar
Passing a simple char like 'test' produces a different int with my UDF than a normal call to HashBytes would produce.
Any advice on this issue?

If you can't create a function and have to use something that already exists in the DB:
sys.fn_repl_hash_binary
can be made to work using the syntax:
sys.fn_repl_hash_binary(cast('some really long string' as varbinary(max)))
Taken from: http://www.sqlnotes.info/2012/01/16/generate-md5-value-from-big-data/

Just use this function (taken from Hashing large data strings with a User Defined Function):
create function dbo.fn_hashbytesMAX
( @string nvarchar(max)
, @Algo varchar(10)
)
returns varbinary(20)
as
/************************************************************
*
* Author: Brandon Galderisi
* Last modified: 15-SEP-2009 (by Denis)
* Purpose: hashes an nvarchar(max) value of any length with
*          HASHBYTES, which accepts at most 8000 bytes
*          (4000 nvarchar characters) per call.
*          - Input that fits in one chunk is hashed directly.
*          - Longer input is cut into 4000-character chunks;
*            each chunk is hashed, each digest is rendered as
*            hex text with sys.fn_varbintohexstr, the hex
*            strings are concatenated IN CHUNK ORDER, and the
*            concatenation is hashed by calling this function
*            again (recursively, until it fits in one chunk).
* Parameters:
*          @string - text to hash (NULL yields NULL).
*          @Algo   - algorithm name passed to HASHBYTES.
* Returns: varbinary(20), i.e. sized for a 20-byte digest.
* Notes:   right(..., 40) keeps the last 40 hex characters of
*          each chunk digest, i.e. it drops the '0x' prefix of
*          a 20-byte (SHA-1) digest.
*          The '@' variable sigils were restored; the copy of
*          this code had them mangled to '#'.
*
*************************************************************/
begin
    declare @concat  nvarchar(max)
          , @NumHash int
          , @HASH    varbinary(20)

    -- datalength() is in bytes; nvarchar uses 2 bytes per character.
    set @NumHash = ceiling((datalength(@string) / 2) / (4000.0))

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    if @NumHash > 1
    begin
        -- Row generator: 2^32 rows, one per possible 4000-character chunk.
        ;with a as (select 1 as n union all select 1)   -- 2
        ,b as (select 1 as n from a cross join a a1)     -- 4
        ,c as (select 1 as n from b cross join b b1)     -- 16
        ,d as (select 1 as n from c cross join c c1)     -- 256
        ,e as (select 1 as n from d cross join d d1)     -- 65,536
        ,f as (select 1 as n from e cross join e e1)     -- 4,294,967,296 rows
        ,factored as (select row_number() over (order by n) rn from f)
        ,factors as (select rn, (rn * 4000) + 1 factor from factored)
        select @concat = cast((
            select right(sys.fn_varbintohexstr
                         (
                             hashbytes(@Algo, substring(@string, factor - 4000, 4000))
                         )
                       , 40) + ''
            from factors
            where rn <= @NumHash
            -- Without an explicit order the chunk digests may be concatenated in any
            -- order, which would make the final hash non-repeatable.
            order by rn
            for xml path('')
        ) as nvarchar(max))

        set @HASH = dbo.fn_hashbytesMAX(@concat, @Algo)
    end
    else
    begin
        set @HASH = convert(varbinary(20), hashbytes(@Algo, @string))
    end

    return @HASH
end
And the results are as following:
select
hashbytes('sha1', N'test') --native function with nvarchar input
,hashbytes('sha1', 'test') --native function with varchar input
,dbo.fn_hashbytesMAX('test', 'sha1') --Galderisi's function which casts to nvarchar input
,dbo.fnGetHash('sha1', 'test') --your function
Output:
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0xA94A8FE5CCB19BA61C4C0873D391E987982FBBD3
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0x00000000AE6DBA4E0F767D06A97038B0C24ED720662ED9F1

I've taken the accepted answer, and modified it a bit with the following improvements:
no longer recursive function
now schema bound
no longer relying on undocumented stored procedures
two versions: one for nvarchar, one for varchar
returns same data size as HASHBYTES, leaving it up to the end user to convert to smaller based on algorithm used. This allows the functions to support future algorithms with larger data returns.
With these changes, the functions can now be used in persisted computed columns as they are now marked deterministic when created.
-- Hashes an NVARCHAR(MAX) value of any length with HASHBYTES, which accepts at most
-- 8000 bytes (4000 NVARCHAR characters) per call.
--   @Algorithm : algorithm name passed straight to HASHBYTES.
--   @Text      : text to hash; NULL yields NULL.
-- Returns the digest as VARBINARY(8000); the caller narrows it to the algorithm's size.
-- While the text needs more than one chunk it is replaced by the concatenation, in chunk
-- order, of the '0x...' hex renderings of each 4000-character chunk's digest; the loop
-- repeats until the text fits in a single HASHBYTES call.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION dbo.fnHashBytesNVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text NVARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT;
    DECLARE @HASH VARBINARY(8000);

    -- DATALENGTH is in bytes, so 8000.0 bytes = one 4000-character chunk.
    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- Row generator: one row per 4000-character chunk (up to 2^32 rows).
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1),          -- 2
        b AS
            (SELECT 1 AS n FROM a CROSS JOIN a a1),      -- 4
        c AS
            (SELECT 1 AS n FROM b CROSS JOIN b b1),      -- 16
        d AS
            (SELECT 1 AS n FROM c CROSS JOIN c c1),      -- 256
        e AS
            (SELECT 1 AS n FROM d CROSS JOIN d d1),      -- 65,536
        f AS
            (SELECT 1 AS n FROM e CROSS JOIN e e1),      -- 4,294,967,296 rows
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        factors AS
            (SELECT rn, (rn * 4000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 4000, 4000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                -- Explicit order: without it the chunk digests may be concatenated in any
                -- order and the final hash would not be repeatable.
                ORDER BY rn
                FOR XML PATH('')
            ) AS NVARCHAR(MAX)
        );

        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;

    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;
-- Hashes a VARCHAR(MAX) value of any length with HASHBYTES, which accepts at most
-- 8000 bytes per call.
--   @Algorithm : algorithm name passed straight to HASHBYTES.
--   @Text      : text to hash; NULL yields NULL.
-- Returns the digest as VARBINARY(8000); the caller narrows it to the algorithm's size.
-- While the text needs more than one chunk it is replaced by the concatenation, in chunk
-- order, of the '0x...' hex renderings of each 8000-byte chunk's digest; the loop repeats
-- until the text fits in a single HASHBYTES call.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION dbo.fnHashBytesVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text VARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT;
    DECLARE @HASH VARBINARY(8000);

    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- Row generator: one row per 8000-character chunk (up to 2^32 rows).
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1),          -- 2
        b AS
            (SELECT 1 AS n FROM a CROSS JOIN a a1),      -- 4
        c AS
            (SELECT 1 AS n FROM b CROSS JOIN b b1),      -- 16
        d AS
            (SELECT 1 AS n FROM c CROSS JOIN c c1),      -- 256
        e AS
            (SELECT 1 AS n FROM d CROSS JOIN d d1),      -- 65,536
        f AS
            (SELECT 1 AS n FROM e CROSS JOIN e e1),      -- 4,294,967,296 rows
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        factors AS
            (SELECT rn, (rn * 8000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 8000, 8000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                -- Explicit order: without it the chunk digests may be concatenated in any
                -- order and the final hash would not be repeatable.
                ORDER BY rn
                FOR XML PATH('')
            ) AS VARCHAR(MAX)   -- stay in VARCHAR: the value is plain hex text
        );

        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;

    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;

You could write a SQL CLR function:
/// <summary>
/// SQL CLR scalar function that hashes a string of any length, for inputs too large
/// for T-SQL HASHBYTES.
/// </summary>
/// <param name="algorithm">Algorithm name understood by HashAlgorithm.Create (e.g. "md5", "sha1").</param>
/// <param name="data">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>
/// The digest bytes, or SQL NULL when either argument is SQL NULL (HASHBYTES also
/// returns NULL for NULL input).
/// </returns>
/// <exception cref="System.ArgumentException">The algorithm name is not recognised.</exception>
/// <remarks>
/// NOTE(review): UTF-8 bytes equal the bytes HASHBYTES sees only for ASCII text held in a
/// varchar; nvarchar input is hashed by HASHBYTES as UTF-16 - confirm which behaviour callers need.
/// </remarks>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBinary BigHashBytes(SqlString algorithm, SqlString data)
{
    // Reading .Value on a SQL NULL throws, so propagate NULL explicitly.
    if (algorithm.IsNull || data.IsNull)
    {
        return SqlBinary.Null;
    }

    // HashAlgorithm owns unmanaged/crypto resources; dispose it deterministically.
    using (var algo = HashAlgorithm.Create(algorithm.Value))
    {
        // Create returns null for an unknown name; fail with a clear message instead
        // of a NullReferenceException.
        if (algo == null)
        {
            throw new System.ArgumentException("Unknown hash algorithm: " + algorithm.Value, "algorithm");
        }

        var bytes = Encoding.UTF8.GetBytes(data.Value);
        return new SqlBinary(algo.ComputeHash(bytes));
    }
}
And then it can be called in SQL like this:
--these return the same value
select HASHBYTES('md5', 'test stuff')
select dbo.BigHashBytes('md5', 'test stuff')
The BigHashBytes is only necessary if the length would be over 8k.

tested and working
select master.sys.fn_repl_hash_binary(someVarbinaryMaxValue)
moreover not complicated :)

This can be used as function body, too:
-- Hashes an NVARCHAR(MAX) value of any length: every 4000-character chunk is hashed,
-- the digests are appended to @res, and @res is hashed once more for the final result.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
DECLARE @A NVARCHAR(MAX) = N'test';
DECLARE @res VARBINARY(MAX) = 0x;
DECLARE @position INT = 1
      -- SUBSTRING positions on NVARCHAR are in characters while DATALENGTH is in bytes
      -- (2 per character); comparing against the raw byte count made the loop hash
      -- extra, empty chunks past the end of the text.
      , @len INT = DATALENGTH(@A) / 2;

WHILE 1 = 1
BEGIN
    SET @res = @res + HASHBYTES('SHA2_256', SUBSTRING(@A, @position, 4000));
    SET @position = @position + 4000;

    -- ISNULL: with NULL input the comparison would be UNKNOWN and the loop would never end.
    IF @position > ISNULL(@len, 0)
        BREAK;
END;

SELECT HASHBYTES('SHA2_256', @res);
The idea is to hash each 4000-character part of the NVARCHAR(MAX) string and concatenate the results, then hash the concatenated result.

It seems the easiest solution is to write a recursive hashing algorithm that parses the input text value into sub varchar(8000) segments.
I arbitrarily chose to slice the input string into 7500 character segments
The hashing algorithm returns a varbinary(20) (a SHA-1 digest), which can easily be converted into a 40-character hexadecimal varchar(40)
-- SHA-1 hash for text of any length.
--   @TextValue : text to hash. NULL is hashed as the literal string 'null'.
-- Returns the 20-byte SHA-1 digest.
-- The text is cut into 7500-character segments. The last segment is hashed on its own;
-- every earlier segment is hashed together with the 40-character hex digest of everything
-- after it:  H(k) = SHA1(segment k + hex(H(k+1))).
-- This is the same chain the original recursion produced, evaluated from the tail forward
-- so that long inputs do not hit SQL Server's 32-level function nesting limit.
-- NOTE(review): segments are converted to varchar before hashing, so characters outside
-- the database code page collapse to '?' - confirm that is acceptable for callers.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
ALTER FUNCTION [dbo].[BigHash]
(
    @TextValue nvarchar(max)
)
RETURNS varbinary(20)
AS
BEGIN
    -- "= null" is never true under ANSI_NULLS; IS NULL is the correct test.
    IF @TextValue IS NULL
        RETURN hashbytes('SHA1', 'null')

    DECLARE @SegmentSize  integer
    DECLARE @TextLength   integer
    DECLARE @Segment      integer
    DECLARE @Part         varchar(7500)
    DECLARE @BinaryValue  varbinary(20)

    SET @SegmentSize = 7500
    -- DATALENGTH / 2 counts trailing spaces; LEN() silently ignores them, which made
    -- 'abc' and 'abc ' hash to the same value.
    SET @TextLength = DATALENGTH(@TextValue) / 2

    -- Number of segments (an empty string still has one, empty, segment).
    SET @Segment = CASE WHEN @TextLength = 0 THEN 1
                        ELSE (@TextLength + @SegmentSize - 1) / @SegmentSize
                   END

    -- Last segment: hashed on its own.
    SET @Part = substring(@TextValue, (@Segment - 1) * @SegmentSize + 1, @SegmentSize)
    SET @BinaryValue = hashbytes('SHA1', @Part)

    -- Earlier segments, back to front.
    WHILE @Segment > 1
    BEGIN
        SET @Segment = @Segment - 1
        SET @Part = substring(@TextValue, (@Segment - 1) * @SegmentSize + 1, @SegmentSize)
        -- varchar(40): a 20-byte digest is 40 hex characters; varchar(20) kept only half of it.
        SET @BinaryValue = hashbytes('SHA1', @Part + convert(varchar(40), @BinaryValue, 2))
    END

    RETURN @BinaryValue
END

Related

Parse a String Expression into Columns

I have a string for example 32,21C2L5N8C stored in one field. Now I want to expand this string into as follows:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC
After getting the above string, I want to count number of commas,C's,L's and N's.
Can some one help me with this please?
You can extract the numbers and the non-numeric characters and then replicate each character. In SQL Server you can use the PATINDEX and REPLICATE functions (explanations are in the code):
-- Expands a run-length string such as '32,21C2L5N8C' (count followed by one character,
-- repeated) into the characters themselves.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)

-- table variable holding each (count, character) pair in input order
declare @t table(id int identity(1,1), num int, nonnum char(1))
declare @str1 varchar(50) = '32,21C2L5N8C' -- the given string
declare @int1 varchar(50) = ''             -- digits of the current count
declare @str2 varchar(50) = ''             -- the current non-numeric character
declare @result varchar(max) = ''

while len(@str1) > 1 -- parse the given string one (count, character) pair at a time
begin
    -- peel leading digits off into @int1
    while patindex('%[0-9]%', @str1) = 1
    begin
        set @int1 = @int1 + substring(@str1, 1, 1)
        set @str1 = substring(@str1, 2, len(@str1) - 1)
    end

    -- the next character is the one to repeat
    set @str2 = substring(@str1, 1, 1)
    set @str1 = substring(@str1, 2, len(@str1) - 1)

    insert into @t(num, nonnum) values (@int1, @str2)

    set @int1 = ''
    set @str2 = ''
end

-- Concatenate in id order. "select @v = @v + ... from @t" does not guarantee row order,
-- so the pieces are assembled with an ordered FOR XML subquery instead.
select @result = coalesce((
    select replicate(nonnum, num)
    from @t
    order by id
    for xml path(''), type
).value('.', 'varchar(max)'), '')

select @result
Output:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC
Edit 1: if you have characters with no number in front of it inside the given string and you want to print it once you can add an extra while loop in above code:
-- Expands a run-length string such as '32,21C2L5NC' into the characters themselves.
-- A character with no count in front of it is emitted once.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)

-- table variable holding each (count, character) pair in input order
declare @t table(id int identity(1,1), num int, nonnum char(1))
declare @str1 varchar(50) = '32,21C2L5NC' -- the given string
declare @int1 varchar(50) = ''            -- digits of the current count
declare @str2 varchar(50) = ''            -- the current non-numeric character
declare @result varchar(max) = ''

while len(@str1) > 0
begin
    -- peel leading digits off into @int1
    set @int1 = ''
    while @str1 like '[0-9]%'
    begin
        set @int1 = @int1 + substring(@str1, 1, 1)
        set @str1 = substring(@str1, 2, len(@str1) - 1)
    end

    -- trailing digits with no character after them: nothing to repeat
    if len(@str1) = 0
        break

    set @str2 = substring(@str1, 1, 1)
    set @str1 = substring(@str1, 2, len(@str1) - 1)

    -- No digits means "once". This replaces the ISNUMERIC() test, which reports
    -- ',', '.', '+', '-' and '$' as numeric and so mis-handled those characters.
    insert into @t(num, nonnum)
    values (coalesce(nullif(@int1, ''), '1'), @str2)
end

-- Concatenate in id order ("select @v = @v + ..." does not guarantee row order).
select @result = coalesce((
    select replicate(nonnum, num)
    from @t
    order by id
    for xml path(''), type
).value('.', 'varchar(max)'), '')

select @result
Output:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNC
Edit 2: if you want the number of repeats of each character, just query the #t table variable in above code, I mean at the end of above query say:
select nonnum [char],num [repeat] from #t
Output:
char repeat
, 32
C 21
L 2
N 5
C 1
You could do this by using a Pattern Splitter. Here is one taken from Dwain Camp's article. The function used, PatternSplitCM, is created by Chris Morris.
-- Splits @List into consecutive runs of characters that all match, or all fail to match,
-- the single-character LIKE pattern @Pattern.
--   @List    : text to split (NULL yields no rows).
--   @Pattern : LIKE pattern tested against one character at a time, e.g. '[^0-9]'.
-- Returns one row per run:
--   ItemNumber - 1-based position of the run within @List,
--   Item       - the run's text,
--   Matched    - 1 when the run's characters match @Pattern, otherwise 0.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION [dbo].[PatternSplitCM]
(
     @List    VARCHAR(8000) = NULL
    ,@Pattern VARCHAR(50)
) RETURNS TABLE WITH SCHEMABINDING
AS
RETURN
    -- Inline tally: 10^4 candidate rows, cut down to one row per character of @List.
    WITH numbers AS (
        SELECT TOP(ISNULL(DATALENGTH(@List), 0))
            n = ROW_NUMBER() OVER(ORDER BY (SELECT NULL))
        FROM
            (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) d (n)
            CROSS JOIN (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) e (n)
            CROSS JOIN (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) f (n)
            CROSS JOIN (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) g (n))
    SELECT
        ItemNumber = ROW_NUMBER() OVER(ORDER BY MIN(n)),
        Item       = SUBSTRING(@List, MIN(n), 1 + MAX(n) - MIN(n)),
        [Matched]
    FROM (
        -- Grouper is constant within a run: position minus the position's rank among
        -- characters with the same Matched flag.
        SELECT n, y.[Matched], Grouper = n - ROW_NUMBER() OVER(ORDER BY y.[Matched], n)
        FROM numbers
        CROSS APPLY (
            SELECT [Matched] = CASE WHEN SUBSTRING(@List, n, 1) LIKE @Pattern THEN 1 ELSE 0 END
        ) y
    ) d
    GROUP BY [Matched], Grouper
Using the function above, you would split your string using the pattern '[^0-9]', which means "not a digit". You would then pivot the result so that each number and its corresponding character are on the same row. After that, you use REPLICATE to generate the strings and concatenate them at the end.
Your final query is:
-- Expands a run-length string (count followed by one character, repeated) using
-- dbo.PatternSplitCM. Assumes the string starts with a count and strictly alternates
-- count / character, so odd items are counts and even items are characters.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
DECLARE
    @String  VARCHAR(8000),
    @Pattern VARCHAR(50),
    @Result  VARCHAR(MAX)

SELECT
    @String  = '32,21C2L5N8C',
    @Pattern = '[^0-9]',
    @Result  = ''

;WITH Cte AS (
    -- Pair item 2k-1 (the count) with item 2k (the character) on one row.
    SELECT
        ID        = (s.ItemNumber + 1) / 2,
        Number    = MAX(CASE WHEN s.ItemNumber % 2 = 1 THEN s.Item END),
        Character = MAX(CASE WHEN s.ItemNumber % 2 = 0 THEN s.Item END)
    FROM dbo.[PatternSplitCM](@String, @Pattern) s
    GROUP BY (s.ItemNumber + 1) / 2
)
-- Concatenate in ID order. "SELECT @v = @v + ... ORDER BY" is not a guaranteed way to get
-- ordered concatenation, so an ordered FOR XML subquery is used instead.
SELECT @Result = COALESCE((
    SELECT REPLICATE(Character, CAST(Number AS INT))
    FROM Cte
    ORDER BY ID
    FOR XML PATH(''), TYPE
).value('.', 'VARCHAR(MAX)'), '')

SELECT @Result
SQL Fiddle
Here is the step by step explanation:
First, split the given string using the pattern '[^0-9]'.
SELECT * FROM dbo.[PatternSplitCM](#String, #Pattern) s
The result is:
ItemNumber Item Matched
-------------------- ---------- -----------
1 32 0
2 , 1
3 21 0
4 C 1
5 2 0
6 L 1
7 5 0
8 N 1
9 8 0
10 C 1
Second, pivot the result so that the corresponding number and character will be on the same row:
SELECT
ID = (s.ItemNumber + 1)/ 2,
Number = MAX(CASE WHEN s.ItemNumber % 2 = 1 THEN s.Item END),
Character = MAX(CASE WHEN s.ItemNumber % 2 = 0 THEN s.Item END)
FROM dbo.[PatternSplitCM](#String, #Pattern) s
GROUP BY (s.ItemNumber + 1)/ 2
The result is:
ID Number Character
------ ---------- ----------
1 32 ,
2 21 C
3 2 L
4 5 N
5 8 C
Last, use REPLICATE(Character, Number) to generate each string and concatenate them to get the final result:
SELECT #Result = #Result + REPLICATE(Character, Number) FROM Cte ORDER BY ID
SELECT #Result
The result is:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC

sql server 2008 not generating random value

I'm having a problem with random values being generated for each row in a result set in SQL Server 2008. I found a similar question here, but upon implementing the proposed answer, I saw the same problem as before. When running the query I have provided below, it seems that the same values will sometimes show up in consecutive rows, even though I'm calling for a new NEWID() with each row.
-- Question's original repro: fills a temp table with pseudo-random values, then tries
-- five times to pick a row at a pseudo-random id and prints it.
-- NOTE(review): the `#` prefix on the scalar variables looks like a transcription artefact
-- of T-SQL's `@` sigil (#Table1 is a genuine temp table) - confirm against the original post.
DECLARE #Id int = 0
DECLARE #Counter int = 1
DECLARE #Value int
CREATE TABLE #Table1
(
id int identity(1,1)
,Value int
)
-- Counter runs from 1 to 99999, so 99,999 rows are inserted, one per iteration.
WHILE #Counter < 100000
BEGIN
INSERT INTO #Table1 (Value)
SELECT CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT)
SET #Counter += 1
END
SET #Counter = 0
WHILE #Counter < 5
BEGIN
-- NOTE(review): the NEWID()-based expression sits inside the WHERE clause, so it is
-- presumably evaluated per row rather than once; the SELECT may then match zero, one or
-- several rows - verify with the execution plan.
SELECT
#Value = T.Value
,#Id = T.id
FROM #Table1 T
WHERE T.id = CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + #Counter
-- #Id and #Value are not reset between iterations, so when the SELECT above assigns
-- nothing the values from the previous iteration are printed again.
IF #Id <> 0
SELECT #Value AS Value ,#Id as ID
SET #Counter += 1
END
DROP TABLE #Table1
If I change the INT to a BIGINT, as suggested in the link I provided, nothing is solved, so I don't believe that it's an "overflow" issue.
If I take the calculation out of the select, I don't get the doubled rows:
-- Picks five rows at pseudo-random ids from a temp table of pseudo-random values.
-- The random id is computed ONCE per iteration into @DERIVED, outside the SELECT, so the
-- WHERE clause compares every row against the same value.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.
--  #Table1 is a genuine temp table and keeps its '#'.)
DECLARE @Id int = 0
DECLARE @Counter int = 1
DECLARE @Value int
-- holds the random id for the current iteration
DECLARE @DERIVED INT

CREATE TABLE #Table1
(
    id int identity(1,1)
   ,Value int
)

-- Counter runs from 1 to 99999: 99,999 rows, one pseudo-random value each.
WHILE @Counter < 100000
BEGIN
    INSERT INTO #Table1 (Value)
    SELECT CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT)

    SET @Counter += 1
END

SET @Counter = 0
WHILE @Counter < 5
BEGIN
    -- set here to remove the calculation from the select
    SET @DERIVED = CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + @Counter;

    -- Reset the outputs: if no row has id = @DERIVED the SELECT assigns nothing, and the
    -- previous iteration's row would otherwise be printed again as a false "duplicate".
    SET @Id = 0
    SET @Value = NULL

    SELECT
        @Value = T.Value
       ,@Id = T.id
    FROM #Table1 T
    WHERE T.id = @DERIVED

    IF @Id <> 0
        SELECT @Value AS Value, @Id as ID;

    SET @Counter += 1
END

DROP TABLE #Table1
I'm seeing the duplicates every time with the pseudorandom generator inside the select. Oddly enough, I get about the same frequency of duplicates on the insert loop whether or not the calculation is inside the insert... select. It could be coincidence, since we are dealing with a randomly selected number. Note also, that since you're adding to the pseudorandom result, the results aren't technically duplicates. They're descending sequences:
11111 + 1 + 1 = 11113
11110 + 1 + 2 = 11113
Same overall result, different pseudorandom result. However, if I change
CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + #Counter
to
CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + #Counter + #Counter
I still consistently get duplicates. That implies that the optimizer may be caching/re-using values, at least on the select. I'd call that improper for a non-deterministic function call. I get similar results on 10.0.1600 and 10.50.1600 (2008 RTM and 2008R2 RTM).

T-SQL: split and aggregate comma-separated values

I have the following table with each row having comma-separated values:
ID
-----------------------------------------------------------------------------
10031,10042
10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039
10009
20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041
I need to get a count for each ID (a groupby).
I am just trying to avoid cursor implementation and stumped on how to do this without cursors.
Any Help would be appreciated !
You will want to use a split function:
-- Splits @String on the single-character @Delimiter.
--   @String    : delimited text; NULL or empty yields no rows.
--   @Delimiter : one-character separator.
-- Returns one row per non-blank item (items that are empty or contain only spaces are
-- skipped, as before).
-- Changes from the earlier version:
--   * '@' variable sigils restored (the copy of this code had them mangled to '#');
--   * the text is scanned by position instead of being re-sliced with RIGHT()/LEN():
--     LEN() ignores trailing spaces, which made the old version drop everything after an
--     item that ended in a space, and re-slicing copied the remaining text every pass;
--   * the slice is varchar(MAX), so items longer than 8000 characters are not cut short.
create FUNCTION [dbo].[Split](@String varchar(MAX), @Delimiter char(1))
returns @temptable TABLE (items varchar(MAX))
as
begin
    declare @start int      -- position of the first character of the current item
    declare @idx int        -- position of the delimiter that ends the current item
    declare @total int      -- length of @String in characters, trailing spaces included
    declare @slice varchar(MAX)

    if @String is null or datalength(@String) = 0 return

    select @start = 1, @total = datalength(@String)

    while @start <= @total
    begin
        set @idx = charindex(@Delimiter, @String, @start)

        -- no further delimiter: the item runs to the end of the text
        if isnull(@idx, 0) = 0
            set @idx = @total + 1

        set @slice = substring(@String, @start, @idx - @start)

        if len(@slice) > 0
            insert into @temptable(items) values (@slice)

        set @start = @idx + 1
    end

    return
end;
And then you can query the data in the following manner:
-- Count how often each individual ID occurs across all of the comma-separated rows.
select
    s.items,
    count(s.items)
from table1 as t1
cross apply dbo.split(t1.id, ',') as s
group by
    s.items
See SQL Fiddle With Demo
Well, the solution i always use, and probably there might be a better way, is to use a function that will split everything. No use for cursors, just a while loop.
-- Re-create the splitter from scratch.
if OBJECT_ID('splitValueByDelimiter') is not null
begin
    drop function splitValueByDelimiter
end
go
-- Splits @inputValue on @delimiter.
--   @inputValue : delimited text; NULL or empty yields no rows.
--   @delimiter  : one-character separator; when NULL or empty the whole text is one item.
-- Returns one row per non-blank item (items that are empty or contain only spaces are
-- skipped, as before).
-- Changes from the earlier version:
--   * '@' variable sigils restored (the copy of this code had them mangled to '#');
--   * the text is scanned by position instead of being re-sliced with RIGHT()/LEN():
--     LEN() ignores trailing spaces, which made the old version lose the text after an
--     item that ended in a space.
create function splitValueByDelimiter (
      @inputValue varchar(max)
    , @delimiter varchar(1)
)
returns @results table (value varchar(max))
as
begin
    declare @segmentStart int      -- first character of the current item
          , @delimiterIndex int    -- delimiter that ends the current item
          , @inputLength int       -- length in characters, trailing spaces included
          , @tempValue varchar(max)

    set @segmentStart = 1
    set @inputLength = datalength(isnull(@inputValue, ''))

    while @segmentStart <= @inputLength
    begin
        set @delimiterIndex = charindex(@delimiter, @inputValue, @segmentStart)

        -- no further delimiter: the item runs to the end of the text
        if isnull(@delimiterIndex, 0) = 0
            set @delimiterIndex = @inputLength + 1

        set @tempValue = substring(@inputValue, @segmentStart, @delimiterIndex - @segmentStart)

        if len(@tempValue) > 0
        begin
            insert into @results (value)
            select @tempValue
        end

        set @segmentStart = @delimiterIndex + 1
    end

    return
end
After that you can call the output like this :
-- Rebuild the demo table from scratch.
if object_id('test') is not null
    drop table test
go
create table test (Id varchar(max))

-- The four sample rows from the question.
insert into test
values
    ('10031,10042'),
    ('10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039'),
    ('10009'),
    ('20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041')

-- One output row per individual ID.
select s.value
from test as t
cross apply splitValueByDelimiter(t.Id, ',') as s
Hope it helps, although i am still looping through everything
After reiterating the comment above about NOT putting multiple values into a single column (Use a separate child table with one value per row!),
Nevertheless, one possible approach: use a UDF to convert delimited string to a table. Once all the values have been converted to tables, combine all the tables into one table and do a group By on that table.
-- Splits a delimited TEXT value into a table, one row per item.
--   @S     : delimited text. It is read through a sliding window of at most @BtchSiz
--            characters, so the whole value is never held in a single varchar.
--   @delim : delimiter (up to 5 characters); NULL means '|'.
-- Returns @tOut(ValNum, sVal): ValNum is an identity in insertion order, sVal is the item
-- with leading spaces removed. Empty items between adjacent delimiters are returned as ''.
-- The parsing logic is unchanged; only the '@' variable sigils were restored (the copy of
-- this code had them mangled to '#') and comments were added.
Create Function dbo.ParseTextString (@S Text, @delim VarChar(5))
Returns @tOut Table
    (ValNum Integer Identity Primary Key,
     sVal VarChar(8000))
As
Begin
    Declare @dlLen TinyInt        -- Length of delimiter
    Declare @wind VarChar(8000)   -- Will contain window into text string
    Declare @winLen Integer       -- Length of window
    Declare @isLastWin TinyInt    -- Boolean to indicate processing last window
    Declare @wPos Integer         -- Start position of window within text string
    Declare @roVal VarChar(8000)  -- String data to insert into output table
    Declare @BtchSiz Integer      -- Maximum size of window
    Set @BtchSiz = 7900           -- (Reset to smaller values to test routine)
    Declare @dlPos Integer        -- Position within window of next delimiter
    Declare @Strt Integer         -- Start position of each data value within window
    -- -------------------------------------------------------------------------
    If @delim is Null Set @delim = '|'
    -- Nothing to do for empty input or input that is just the delimiter.
    If DataLength(@S) = 0 Or
       Substring(@S, 1, @BtchSiz) = @delim Return
    -- --------------------------------------------
    -- Load the first window.
    Select @dlLen = DataLength(@delim),
           @Strt = 1, @wPos = 1,
           @wind = Substring(@S, 1, @BtchSiz)
    -- A window shorter than @BtchSiz must be the last one.
    Select @winLen = DataLength(@wind),
           @isLastWin = Case When DataLength(@wind) = @BtchSiz
                             Then 0 Else 1 End,
           @dlPos = CharIndex(@delim, @wind, @Strt)
    -- --------------------------------------------
    While @Strt <= @winLen
    Begin
        If @dlPos = 0 Begin -- No more delimiters in window
            -- Last window: the item runs to the end of the window.
            If @isLastWin = 1 Set @dlPos = @winLen + 1
            Else Begin
                -- Slide the window so it starts at the current, unfinished item.
                Set @wPos = @wPos + @Strt - 1
                Set @wind = Substring(@S, @wPos, @BtchSiz)
                -- ----------------------------------------
                Select @winLen = DataLength(@wind), @Strt = 1,
                       @isLastWin = Case When DataLength(@wind) = @BtchSiz
                                         Then 0 Else 1 End,
                       @dlPos = CharIndex(@delim, @wind, 1)
                If @dlPos = 0 Set @dlPos = @winLen + 1
            End
        End
        -- -------------------------------
        Insert @tOut (sVal)
        Select LTrim(Substring(@wind,
                               @Strt, @dlPos - @Strt))
        -- -------------------------------
        -- Move @Strt to char after last delimiter
        Set @Strt = @dlPos + @dlLen
        Set @dlPos = CharIndex(@delim, @wind, @Strt)
    End
    Return
End
Then write, (using your table schema),
-- Glue every row's comma-separated list into one long list, split it, and count how often
-- each individual value occurs.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
-- VarChar(Max): a VarChar(8000) accumulator silently truncates once the combined list
-- passes 8000 characters.
Declare @AllVals VarChar(Max)
-- [Table] is a placeholder for your own table name; TABLE is a reserved word, so it has
-- to be bracketed to parse at all.
Select @AllVals = Coalesce(@AllVals + ',', '') + ID
From [Table] Where ID Is Not Null
-- -----------------------------------------
Select sVal, Count(*)
From dbo.ParseTextString(@AllVals, ',')
Group By sVal

T-SQL strip all non-alpha and non-numeric characters

Is there a smarter way to remove all special characters rather than having a series of about 15 nested replace statements?
The following works, but only handles three characters (ampersand, blank and period).
select CustomerID, CustomerName,
Replace(Replace(Replace(CustomerName,'&',''),' ',''),'.','') as CustomerNameStripped
from Customer
One flexible-ish way;
-- Removes every character of @BUFFER that matches @PATTERN.
--   @BUFFER  : text to clean.
--   @PATTERN : PATINDEX pattern locating ONE unwanted character, e.g. '%[$&.!?]%'.
-- Returns the text with all matching characters deleted. A NULL pattern or buffer makes
-- PATINDEX return NULL, so the loop is skipped and @BUFFER comes back unchanged.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION [dbo].[fnRemovePatternFromString](@BUFFER VARCHAR(MAX), @PATTERN VARCHAR(128)) RETURNS VARCHAR(MAX) AS
BEGIN
    DECLARE @POS INT = PATINDEX(@PATTERN, @BUFFER)

    -- Delete the first match, then look again, until nothing matches.
    WHILE @POS > 0 BEGIN
        SET @BUFFER = STUFF(@BUFFER, @POS, 1, '')
        SET @POS = PATINDEX(@PATTERN, @BUFFER)
    END

    RETURN @BUFFER
END
select dbo.fnRemovePatternFromString('cake & beer $3.99!?c', '%[$&.!?]%')
(No column name)
cake beer 399c
Create a function:
-- Returns @s with every character that is not a letter or a digit removed.
--   @s : text to clean (up to 255 characters). NULL yields ''.
-- NOTE(review): the [A-Za-z0-9] range is evaluated under the database collation; under
-- some collations accented letters may fall inside A-Z - add a binary COLLATE clause if
-- strictly ASCII output is required.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION dbo.StripNonAlphaNumerics
(
    @s VARCHAR(255)
)
RETURNS VARCHAR(255)
AS
BEGIN
    DECLARE @p INT = 1, @n VARCHAR(255) = '';

    -- Walk the text one character at a time, keeping only the allowed characters.
    WHILE @p <= LEN(@s)
    BEGIN
        IF SUBSTRING(@s, @p, 1) LIKE '[A-Za-z0-9]'
        BEGIN
            SET @n += SUBSTRING(@s, @p, 1);
        END
        SET @p += 1;
    END

    RETURN(@n);
END
GO
Then:
SELECT Result = dbo.StripNonAlphaNumerics
('My Customer''s dog & #1 friend are dope, yo!');
Results:
Result
------
MyCustomersdog1friendaredopeyo
To make it more flexible, you could pass in the pattern you want to allow:
-- Returns @s with every character that does not match @pattern removed.
--   @s       : text to clean (up to 255 characters). NULL yields ''.
--   @pattern : LIKE pattern for ONE character to keep, e.g. '[A-Za-z0-9]'.
-- ('@' variable sigils restored; the copy of this code had them mangled to '#'.)
CREATE FUNCTION dbo.StripNonAlphaNumerics
(
    @s VARCHAR(255),
    @pattern VARCHAR(255)
)
RETURNS VARCHAR(255)
AS
BEGIN
    DECLARE @p INT = 1, @n VARCHAR(255) = '';

    -- DATALENGTH rather than LEN: LEN ignores trailing spaces, so they were dropped even
    -- when @pattern explicitly allowed a space.
    WHILE @p <= DATALENGTH(@s)
    BEGIN
        IF SUBSTRING(@s, @p, 1) LIKE @pattern
        BEGIN
            SET @n += SUBSTRING(@s, @p, 1);
        END
        SET @p += 1;
    END

    RETURN(@n);
END
GO
Then:
SELECT r = dbo.StripNonAlphaNumerics
('Bob''s dog & #1 friend are dope, yo!', '[A-Za-z0-9]');
Results:
r
------
Bobsdog1friendaredopeyo
I faced this problem several years ago, so I wrote a SQL function to do the trick. Here is the original article (was used to scrape text out of HTML). I have since updated the function, as follows:
IF (object_id('dbo.fn_CleanString') IS NOT NULL)
BEGIN
PRINT 'Dropping: dbo.fn_CleanString'
DROP function dbo.fn_CleanString
END
GO
PRINT 'Creating: dbo.fn_CleanString'
GO
CREATE FUNCTION dbo.fn_CleanString
(
    @string varchar(8000)
)
returns varchar(8000)
AS
BEGIN
---------------------------------------------------------------------------------------------------
-- Title:        CleanString
-- Date Created: March 26, 2011
-- Author:       William McEvoy
--
-- Description:  Replaces control characters and every character that is not a letter or a
--               digit with a space, then trims the result and condenses runs of spaces into
--               a single space.
-- Parameters:   @string - text to clean. NULL yields ''.
-- Returns:      the cleaned text: letters and digits separated by single spaces.
-- Notes:        The condense step now runs AFTER the character loop. It used to run before
--               it, so punctuation next to a space ("a, b") was turned into a second space
--               and the output still contained runs of spaces.
--               '@' variable sigils restored; the copy of this code had them mangled to '#'.
---------------------------------------------------------------------------------------------------
    declare @char        char(1),
            @len         int,
            @count       int,
            @newstring   varchar(8000),
            @replacement char(1)

    select @count       = 1,
           @len         = 0,
           @newstring   = '',
           @replacement = ' '

---------------------------------------------------------------------------------------------------
-- M A I N   P R O C E S S I N G
---------------------------------------------------------------------------------------------------
    -- Replace backspace, tab, line feed and carriage return characters
    select @string = replace(@string, char(8),  @replacement)
    select @string = replace(@string, char(9),  @replacement)
    select @string = replace(@string, char(10), @replacement)
    select @string = replace(@string, char(13), @replacement)

    -- Parse each character; anything that is not alpha-numeric becomes a space
    select @len = len(@string)
    WHILE (@count <= @len)
    BEGIN
        select @char = substring(@string, @count, 1)

        IF @char like '[a-zA-Z0-9]'
            select @newstring = @newstring + @char
        ELSE
            select @newstring = @newstring + @replacement

        select @count = @count + 1
    END

    -- Condense multiple spaces into a single space.
    -- Every space O becomes OX (X = char(7), a marker that cannot occur in the cleaned
    -- text), then every XO is removed, then every remaining X is removed, which leaves
    -- exactly one O per run of spaces.
    -- The cast to varchar(max) keeps the intermediate OX text from being cut at 8000 bytes.
    select @newstring = replace(replace(replace(
                            cast(ltrim(rtrim(@newstring)) as varchar(max)),
                            ' ', ' ' + char(7)),
                            char(7) + ' ', ''),
                            char(7), '')

    return @newstring
END
GO
IF (object_id('dbo.fn_CleanString') IS NOT NULL)
PRINT 'Function created.'
ELSE
PRINT 'Function NOT created.'
GO
I know this is an old thread, but still, might be handy for others.
Here's a quick and dirty (Which I've done inversely - stripping out non-numerics) - using a recursive CTE.
What makes this one nice for me is that it's an inline function - so gets around the nasty RBAR effect of the usual scalar and table-valued functions.
Adjust your filter as needs be to include or exclude whatever char types.
-- Inline table-valued function that returns @iString with everything except the digits
-- 0-9 removed.
--   @iString : text to clean.
-- Returns one row with one column, oString.
-- A recursive CTE walks the text one character per recursion level and appends the
-- character when it is a digit.
-- Changes from the earlier version:
--   * '@' variable sigils restored (the copy of this code had them mangled to '#');
--   * digits are detected with LIKE '[0-9]' instead of ISNUMERIC(), which also reports
--     characters such as '.', ',', '+', '-' and '$' as numeric and therefore kept them.
-- NOTE: an inline function cannot carry its own MAXRECURSION hint, so for text longer
-- than about 100 characters the CALLING query needs OPTION (MAXRECURSION 0).
Create Function fncV1_iStripAlphasFromData (
    @iString Varchar(max)
)
Returns
Table With Schemabinding
As
Return(
    with RawData as
    (
        Select @iString as iString
    )
    ,
    Anchor as
    (
        -- anchor: first character; CharPos is the NEXT position to examine
        Select Case When substring(iString, 1, 1) Like '[0-9]'
                    Then substring(iString, 1, 1) Else '' End as oString,
               2 as CharPos
        from RawData
        UNION ALL
        -- recursive step: append the character at CharPos when it is a digit
        Select a.oString + Case When substring(@iString, a.CharPos, 1) Like '[0-9]'
                                Then substring(@iString, a.CharPos, 1) Else '' End,
               a.CharPos + 1
        from RawData r
        Inner Join Anchor a on a.CharPos <= len(rtrim(ltrim(@iString)))
    )
    -- the row with the highest CharPos holds the complete result
    Select top 1 oString from Anchor order by CharPos Desc
)
select * from dbo.fncV1_iStripAlphasFromData ('00000')
select * from dbo.fncV1_iStripAlphasFromData ('00A00')
select * from dbo.fncV1_iStripAlphasFromData ('12345ABC6789!&*0')
If you can use SQL CLR you can use .NET regular expressions for this.
There is a third party (free) package that includes this and more - SQL Sharp .

Handling larger strings in Sql Server 2008

We have a stored procedure that lets a user maintain comma-separated search tags in their software product's admin area. The user can add comma-separated tags, and when they want to edit them, we read all the tags from the table, rebuild them as a comma-separated value (CSV) string inside the stored procedure, and return that string to the calling code. Recently a user complained that he could not see the new tags he had written. I looked into it and found that the stored procedure truncates the string while it reads the values from the database and builds the CSV string. The string is of type nvarchar, and because it exceeds the 4000-character maximum, the value gets truncated. Any ideas on how to work around this problem?
Find my code underneath.
-- Question's original procedure body: builds one CSV string of synonyms and one of SKUs
-- for a search term and returns them joined by '-_-'.
-- NOTE(review): the `#` prefix on every variable looks like a transcription artefact of
-- T-SQL's `@` sigil - confirm against the original post. #SearchTermID is not declared in
-- this fragment; presumably it is a parameter of the enclosing procedure.
BEGIN
-- ---- Part 1: synonyms -> #Synonyms_CSV ----
BEGIN
Declare #Synonyms Table
(
RowID int Identity(1,1),
SynonymID int,
[Synonym] nvarchar(4000)
);
SET NOCOUNT ON;
Insert #Synonyms(SynonymID, [Synonym])
Select distinct SynonymID, [Synonym] From RF_SearchSynonyms with(nolock) Where SearchTermID = #SearchTermID And ActiveInd = 1
If((Select COUNT(RowID) From #Synonyms) <> 0)
BEGIN
-- NOTE(review): the accumulator is nvarchar(4000), so the concatenation below cannot grow
-- past 4000 characters - this is the truncation the question describes.
Declare #CurrentRow int = (Select MIN(RowID) From #Synonyms),
#TotalRows int = (Select MAX(RowID) From #Synonyms),
#Synonyms_CSV nvarchar(4000) = '';
-- Row-by-row walk over the table variable, appending "<synonym>," each time.
WHILE #CurrentRow <= #TotalRows
BEGIN
-- NOTE(review): nvarchar(500) is narrower than the nvarchar(4000) source column, so a long
-- synonym would be cut here - verify the real maximum length.
Declare #TempSyn nvarchar(500);
Select #TempSyn = [Synonym] + ',' From #Synonyms Where RowID = #CurrentRow;
Set #Synonyms_CSV = #Synonyms_CSV + LTRIM(RTRIM(LOWER(#TempSyn)));
SET #CurrentRow = #CurrentRow + 1
END
END
Else
BEGIN
Set #Synonyms_CSV = '';
END
END
-- ---- Part 2: SKUs -> #Skus_CSV (same pattern as part 1) ----
BEGIN
Declare #SKUs Table
(
RowID int Identity(1,1),
SkuID int,
SKU nvarchar(15)
);
SET NOCOUNT ON;
Insert #SKUs(SkuID, SKU)
Select distinct SkuID, SKU From RF_SearchSkus with(nolock) Where SearchTermID = #SearchTermID And ActiveInd = 1
If((Select COUNT(RowID) From #SKUs) <> 0)
BEGIN
-- NOTE(review): same nvarchar(4000) ceiling as #Synonyms_CSV.
Declare #CurrentRow1 int = (Select MIN(RowID) From #SKUs),
#TotalRows1 int = (Select MAX(RowID) From #SKUs),
#Skus_CSV nvarchar(4000) = '';
WHILE #CurrentRow1 <= #TotalRows1
BEGIN
-- NOTE(review): nvarchar(15) matches the SKU column, so a full-length SKU plus ',' would
-- need 16 characters - verify whether the trailing comma can be lost.
Declare #TempSku nvarchar(15);
Select #TempSku = SKU + ',' From #SKUs Where RowID = #CurrentRow1;
Set #Skus_CSV = #Skus_CSV + LTRIM(RTRIM(#TempSku));
SET #CurrentRow1 = #CurrentRow1 + 1
END
END
Else
BEGIN
Set #Skus_CSV = '';
END
END
-- ---- Part 3: return "<synonyms>-_-<skus>" as a single-column result set ----
BEGIN
-- #Combined, #syn_len and #sku_len are assigned but not read again in this fragment.
Declare #Combined varchar(8000),
#syn_len int = 0,
#sku_len int = 0;
Select #syn_len = LEN(#Synonyms_CSV);
Select #sku_len = LEN(#Skus_CSV);
Select #Combined = #Synonyms_CSV + '-_-' + #Skus_CSV;
Select #Synonyms_CSV + '-_-' + #Skus_CSV;
END
END
I can't use text and ntext as they do not play nice with concatenation operations.
Thanks.
How are you declaring the string parameter?
nvarchar(max)
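can store up to 2^31-1 bytes (2 GB), i.e. roughly one billion characters, instead of the 4000-character limit of nvarchar(4000).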
See this link.