T-SQL strip all non-alpha and non-numeric characters - sql-server-2008

Is there a smarter way to remove all special characters rather than having a series of about 15 nested replace statements?
The following works, but only handles three characters (ampersand, blank and period).
-- Strips '&', ' ' and '.' from CustomerName using one REPLACE per character.
SELECT
    CustomerID,
    CustomerName,
    REPLACE(
        REPLACE(
            REPLACE(CustomerName, '&', ''),
            ' ', ''),
        '.', '') AS CustomerNameStripped
FROM Customer

One flexible-ish way;
-- Deletes characters from #BUFFER until #PATTERN no longer matches.
--   #BUFFER  : text to clean.
--   #PATTERN : PATINDEX pattern, e.g. '%[$&.!?]%'; on every pass the single
--              character at the position PATINDEX reports is removed.
-- Returns the remaining text. A NULL PATINDEX result ends the loop at once.
CREATE FUNCTION [dbo].[fnRemovePatternFromString](#BUFFER VARCHAR(MAX), #PATTERN VARCHAR(128))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE #hit INT;

    WHILE 1 = 1
    BEGIN
        SET #hit = PATINDEX(#PATTERN, #BUFFER);

        -- No (further) match: stop.
        IF #hit IS NULL OR #hit <= 0
            BREAK;

        -- Cut out exactly one character at the match position.
        SET #BUFFER = STUFF(#BUFFER, #hit, 1, '');
    END

    RETURN #BUFFER
END
-- Example: drop '$', '&', '.', '!' and '?' from a literal.
SELECT dbo.fnRemovePatternFromString(
    'cake & beer $3.99!?c',
    '%[$&.!?]%'
)
(No column name)
cake beer 399c

Create a function:
-- Returns #s with every character that does not match [A-Za-z0-9] removed.
--   #s : input text (up to 255 characters).
-- The input is scanned once, left to right; a NULL input produces ''.
CREATE FUNCTION dbo.StripNonAlphaNumerics
(
    #s VARCHAR(255)
)
RETURNS VARCHAR(255)
AS
BEGIN
    DECLARE #kept  VARCHAR(255) = '';
    DECLARE #idx   INT = 0;
    DECLARE #total INT = LEN(#s);
    DECLARE #ch    VARCHAR(1);

    WHILE #idx < #total
    BEGIN
        SET #idx = #idx + 1;
        SET #ch  = SUBSTRING(#s, #idx, 1);

        -- Append the character only when it is a letter or a digit.
        SET #kept = #kept + CASE WHEN #ch LIKE '[A-Za-z0-9]' THEN #ch ELSE '' END;
    END

    RETURN #kept;
END
GO
Then:
-- Example call; the doubled quote is an escaped apostrophe inside the literal.
SELECT dbo.StripNonAlphaNumerics('My Customer''s dog & #1 friend are dope, yo!') AS Result;
Results:
Result
------
MyCustomersdog1friendaredopeyo
To make it more flexible, you could pass in the pattern you want to allow:
-- Returns #s reduced to the characters that match #pattern.
--   #s       : input text (up to 255 characters).
--   #pattern : LIKE pattern tested against each single character,
--              e.g. '[A-Za-z0-9]' to keep letters and digits.
-- A NULL input produces ''.
CREATE FUNCTION dbo.StripNonAlphaNumerics
(
    #s VARCHAR(255),
    #pattern VARCHAR(255)
)
RETURNS VARCHAR(255)
AS
BEGIN
    DECLARE #kept  VARCHAR(255) = '';
    DECLARE #idx   INT = 0;
    DECLARE #total INT = LEN(#s);
    DECLARE #ch    VARCHAR(1);

    WHILE #idx < #total
    BEGIN
        SET #idx = #idx + 1;
        SET #ch  = SUBSTRING(#s, #idx, 1);

        -- Keep the character only when it satisfies the caller's pattern.
        SET #kept = #kept + CASE WHEN #ch LIKE #pattern THEN #ch ELSE '' END;
    END

    RETURN #kept;
END
GO
Then:
-- Example call that keeps letters and digits only.
SELECT dbo.StripNonAlphaNumerics(
           'Bob''s dog & #1 friend are dope, yo!',
           '[A-Za-z0-9]'
       ) AS r;
Results:
r
------
Bobsdog1friendaredopeyo

I faced this problem several years ago, so I wrote a SQL function to do the trick. Here is the original article (was used to scrape text out of HTML). I have since updated the function, as follows:
-- Deployment script for dbo.fn_CleanString: drop any existing copy, create
-- the function, then report whether it exists.
IF OBJECT_ID('dbo.fn_CleanString') IS NOT NULL
BEGIN
    PRINT 'Dropping: dbo.fn_CleanString';
    DROP FUNCTION dbo.fn_CleanString;
END
GO
PRINT 'Creating: dbo.fn_CleanString'
GO
---------------------------------------------------------------------------------------------------
-- Title:        CleanString
-- Date Created: March 26, 2011
-- Author:       William McEvoy
--
-- Description:  Replaces backspace, tab, line feed and carriage return with a
--               space, trims the text, condenses runs of spaces to one space,
--               then replaces every character that is not a letter or a digit
--               with a space.
-- Parameter:    #string - text to clean (varchar(8000)).
-- Returns:      the cleaned text; the scan is driven by LEN(#string), so a
--               NULL input yields ''.
---------------------------------------------------------------------------------------------------
CREATE FUNCTION dbo.fn_CleanString
(
    #string varchar(8000)
)
RETURNS varchar(8000)
AS
BEGIN
    DECLARE #filler    char(1)       = ' ';   -- stands in for every removed character
    DECLARE #newstring varchar(8000) = '';
    DECLARE #pos       int           = 1;
    DECLARE #len       int           = 0;
    DECLARE #char      char(1);

    -- Backspace (8), tab (9), line feed (10) and carriage return (13) become spaces.
    SET #string = REPLACE(REPLACE(REPLACE(REPLACE(#string, CHAR(8), #filler), CHAR(9), #filler), CHAR(10), #filler), CHAR(13), #filler);

    -- Condense repeated spaces: tag each space with a BEL (char 7) marker,
    -- delete every "BEL + space" pair, then delete the leftover markers.
    SET #string = LTRIM(RTRIM(#string));
    SET #string = REPLACE(#string, ' ', ' ' + CHAR(7));
    SET #string = REPLACE(#string, CHAR(7) + ' ', '');
    SET #string = REPLACE(#string, CHAR(7), '');

    -- Copy letters and digits; anything else is replaced by the filler.
    SET #len = LEN(#string);
    WHILE #pos <= #len
    BEGIN
        SET #char = SUBSTRING(#string, #pos, 1);
        SET #newstring = #newstring
            + CASE
                  WHEN #char LIKE '[a-z]' OR #char LIKE '[A-Z]' OR #char LIKE '[0-9]' THEN #char
                  ELSE #filler
              END;
        SET #pos = #pos + 1;
    END

    RETURN #newstring
END
GO
IF OBJECT_ID('dbo.fn_CleanString') IS NULL
    PRINT 'Function NOT created.'
ELSE
    PRINT 'Function created.'
GO

I know this is an old thread, but still, might be handy for others.
Here's a quick and dirty (Which I've done inversely - stripping out non-numerics) - using a recursive CTE.
What makes this one nice for me is that it's an inline function - so gets around the nasty RBAR effect of the usual scalar and table-valued functions.
Adjust your filter as needs be to include or exclude whatever char types.
-- Inline table-valued function that returns #iString reduced to its digits.
--   #iString : text to filter.
-- Returns one row with one column, oString, holding the digits 0-9 of the
-- input in their original order ('' when there are none or the input is NULL).
--
-- Each character is tested with LIKE '[0-9]'. ISNUMERIC is not used because it
-- also reports 1 for single characters such as '$', '+', '-', '.' and ',',
-- which would therefore be kept.
-- The walk is bounded by LEN(#iString): character positions refer to the
-- untrimmed string, so the bound must not be the trimmed length. LEN ignores
-- trailing blanks only, and those are never digits.
-- The recursion advances one level per character, so inputs longer than the
-- default recursion limit (100 levels) need OPTION (MAXRECURSION n) on the
-- calling statement.
Create Function fncV1_iStripAlphasFromData (
    #iString Varchar(max)
)
Returns
    Table With Schemabinding
As
Return(
    with Walker as
    (
        -- Anchor member: examine the first character.
        Select Cast(Case When substring(#iString, 1, 1) Like '[0-9]'
                         Then substring(#iString, 1, 1)
                         Else ''
                    End as Varchar(max)) as oString,
               2 as CharPos
        UNION ALL
        -- Recursive member: append the character at CharPos when it is a digit.
        Select Cast(w.oString
                    + Case When substring(#iString, w.CharPos, 1) Like '[0-9]'
                           Then substring(#iString, w.CharPos, 1)
                           Else ''
                      End as Varchar(max)),
               w.CharPos + 1
        from Walker w
        where w.CharPos <= len(#iString)
    )
    -- The row with the highest CharPos carries the fully accumulated string.
    Select top 1 oString from Walker order by CharPos Desc
)
Go
-- Example calls: all digits, one embedded letter, and mixed content.
SELECT oString FROM dbo.fncV1_iStripAlphasFromData('00000')
SELECT oString FROM dbo.fncV1_iStripAlphasFromData('00A00')
SELECT oString FROM dbo.fncV1_iStripAlphasFromData('12345ABC6789!&*0')

If you can use SQL CLR you can use .NET regular expressions for this.
There is a third party (free) package that includes this and more - SQL Sharp .

Related

Split string values separated by a semicolon using sProc in MySQL 8

I need to split this string using a stored procedure in MySQL 8:
"John;Elizabeth;Mark;Zagor;Annie;Lucy;Peter;Robin;Wilson;Tom;Bettie;Myriam;Frankie;Nick;Marilyn"
The string values are separated by a semicolon.
My sProc below.
The problem is in the output.
The first name split from this string is Elizabeth, not John.
Where is John?
All the other names are present in the output of the sProc; only John is missing...
What am I doing wrong?
-- Body of the question's first attempt: walks a ';'-separated list and SELECTs
-- one name per loop pass. As written it never outputs the first name.
BEGIN
DECLARE tNameSeries LONGTEXT;
DECLARE t_tNameSeries LONGTEXT;
SET tNameSeries = "John;Elizabeth;Mark;Zagor;Annie;Lucy;Peter;Robin;Wilson;Tom;Bettie;Myriam;Frankie;Nick;Marilyn";
-- The loop runs while at least one ';' remains in the list.
WHILE LOCATE(";",tNameSeries) > 0 DO
-- The leading "name;" is removed BEFORE anything has been read, so on the first
-- pass "John;" is discarded without ever being selected.
-- REPLACE removes every occurrence of that text, not only the leading one.
SET tNameSeries = REPLACE (tNameSeries, (SELECT LEFT(tNameSeries,LOCATE(";",tNameSeries))),'');
-- Reads the first name of the already shortened list.
SET t_tNameSeries = SUBSTRING_INDEX(tNameSeries,";",1);
SELECT t_tNameSeries;
END WHILE;
END
update
Using this edited sProc, the output is only John:
-- Body of the question's second attempt. It reads the first name correctly but
-- then overwrites the whole list with that single name, so the loop ends after
-- one pass.
BEGIN
DECLARE tNameSeries LONGTEXT;
DECLARE t_tNameSeries LONGTEXT;
SET tNameSeries = "John;Elizabeth;Mark;Zagor;Annie;Lucy;Peter;Robin;Wilson;Tom;Bettie;Myriam;Frankie;Nick;Marilyn";
WHILE LOCATE(";",tNameSeries) > 0 DO
-- t_tNameSeries becomes the first name only ("John"), which contains no ';'.
SET t_tNameSeries = SUBSTRING_INDEX(tNameSeries,";",1);
-- The REPLACE works on t_tNameSeries, not on the list: LOCATE finds no ';' in it,
-- LEFT(..., 0) is '', and replacing '' leaves "John" unchanged. That single name
-- is then assigned to tNameSeries, discarding the rest of the list.
SET tNameSeries = REPLACE (t_tNameSeries, (SELECT LEFT(t_tNameSeries,LOCATE(";",t_tNameSeries))),'');
SELECT tNameSeries;
-- tNameSeries no longer contains ';', so the loop condition is now false.
END WHILE;
END
-- Splits the ';'-separated list by rewriting it as a JSON array of strings
-- (each ';' becomes '","', wrapped in '["' ... '"]') and expanding that array
-- with JSON_TABLE: one row per element, numbered by the ordinality column.
SELECT
    jsontable.id,
    jsontable.name
FROM JSON_TABLE(
         CONCAT('["',
                REPLACE("John;Elizabeth;Mark;Zagor;Annie;Lucy;Peter;Robin;Wilson;Tom;Bettie;Myriam;Frankie;Nick;Marilyn", ';', '","'),
                '"]'),
         '$[*]'
         COLUMNS (
             id   FOR ORDINALITY,
             name VARCHAR(255) PATH '$'
         )
     ) AS jsontable;
https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=546907fc5c00b7173fa73327fdd97638
Insert it into the SP if needed.
This works too:
-- Splits #names on ';' without a loop: for each row number R the nested
-- SUBSTRING_INDEX calls keep the first R items and then the last one of those,
-- i.e. the R-th item. R is limited to (number of ';' in the list) + 1.
-- information_schema.tables serves only as a row source for the numbers, so
-- it must hold at least as many rows as the list has items.
SET #names = 'John;Elizabeth;Mark;Zagor;Annie;Lucy;Peter;Robin;Wilson;Tom;Bettie;Myriam;Frankie;Nick;Marilyn';
SELECT
    SUBSTRING_INDEX(SUBSTRING_INDEX(#names, ';', x.R), ';', -1) AS W
FROM (
    SELECT ROW_NUMBER() OVER () AS R
    FROM information_schema.tables
) AS x
WHERE x.R <= 1 + LENGTH(#names) - LENGTH(REPLACE(#names, ';', ''));
see: https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=5cc442be9da54d8cbcdbabc58ee37b65

Get data from comma separated string input

I have a stored procedure whose input is a comma-separated string, say '12341,34567,12446,12997', and it is not guaranteed that the input string always carries numerical data. It may be '12341,34as67,12$46,1we97', so I need to validate the items and use only the valid ones in the query.
Say my query is (Where the column OrderCode is int type)
select * from dbo.DataCollector where OrderCode in (12341,34567,12446,12997)
or only the valid data if other are invalid
select * from dbo.DataCollector where OrderCode in (12341)
For such situation what would be a good solution.
One way that works also in SQl-Server 2005 would be to create a split-function, then you can use ISNUMERIC to check if it's a number:
-- Split the list and keep only the items that ISNUMERIC accepts.
DECLARE #Input VARCHAR(MAX) = '12341,34as67,12$46,1we97'

SELECT i.Item
FROM dbo.Split(#Input, ',') AS i
WHERE ISNUMERIC(i.Item) = 1
Demo
Your complete query:
-- Rows of dbo.DataCollector whose OrderCode appears among the numeric items of
-- the list held in #Input.
SELECT *
FROM dbo.DataCollector
WHERE OrderCode IN (
    SELECT i.Item
    FROM dbo.Split(#Input, ',') AS i
    WHERE ISNUMERIC(i.Item) = 1
)
Here is the split-function which i use:
-- Splits #ItemList at every #delimiter and returns one row per item.
--   #ItemList  : delimited list.
--   #delimiter : single-character separator.
-- Returns table (Item VARCHAR(250)), rows in list order. Empty items between
-- two delimiters are returned as ''; a trailing delimiter adds no extra row.
-- A NULL or empty list (or one made only of blanks) returns no rows.
CREATE FUNCTION [dbo].[Split]
(
    #ItemList NVARCHAR(MAX),
    #delimiter CHAR(1)
)
RETURNS #ItemTable TABLE (Item VARCHAR(250))
AS
BEGIN
    DECLARE #tempItemList NVARCHAR(MAX)
    SET #tempItemList = #ItemList

    DECLARE #i INT
    DECLARE #Item NVARCHAR(4000)

    SET #i = CHARINDEX(#delimiter, #tempItemList)

    WHILE (LEN(#tempItemList) > 0)
    BEGIN
        -- No delimiter left: the remainder is the last item.
        IF #i = 0
            SET #Item = #tempItemList
        ELSE
            SET #Item = LEFT(#tempItemList, #i - 1)

        INSERT INTO #ItemTable(Item) VALUES(#Item)

        IF #i = 0
            SET #tempItemList = ''
        ELSE
            -- Keep everything after the delimiter. SUBSTRING from #i + 1 is
            -- used instead of RIGHT(..., LEN(...) - #i): LEN ignores trailing
            -- blanks, so the RIGHT form miscounts and drops or clips items
            -- when the list ends in spaces. DATALENGTH only has to be at
            -- least the remaining length.
            SET #tempItemList = SUBSTRING(#tempItemList, #i + 1, DATALENGTH(#tempItemList))

        SET #i = CHARINDEX(#delimiter, #tempItemList)
    END
    RETURN
END
Edit, following Damien's comment that ISNUMERIC has its issues: you can use this function to check whether a value is a real integer:
-- Returns 1 when #Value passes the numeric test below and has no non-zero
-- fractional part, otherwise 0.
--   #Value : text to test (at most 18 characters).
CREATE FUNCTION dbo.IsInteger(#Value VarChar(18))
RETURNS Bit
AS
BEGIN
-- IsNull(..., 0): when the WHERE test fails the inner SELECT yields no row,
-- the subquery is NULL, and the function returns 0.
RETURN IsNull(
-- A value containing '.' is accepted only when the text after the last '.'
-- converts to the integer 0 (for example '12.00').
(Select Case When CharIndex('.', #Value) > 0
-- ParseName(#Value, 1) returns the right-most dot-separated part of the text.
Then Case When Convert(int, ParseName(#Value, 1)) <> 0
Then 0
Else 1
End
Else 1
End
-- NOTE(review): the appended 'e0' presumably makes IsNumeric reject inputs that
-- already use exponent notation or carry currency symbols; confirm against the
-- ISNUMERIC documentation.
Where IsNumeric(#Value + 'e0') = 1), 0)
END
Here is another example with damien's "bad" input which contains £ and 0d0:
Demo

T-SQL: split and aggregate comma-separated values

I have the following table with each row having comma-separated values:
ID
-----------------------------------------------------------------------------
10031,10042
10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039
10009
20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041
I need to get a count for each ID (a groupby).
I am just trying to avoid cursor implementation and stumped on how to do this without cursors.
Any Help would be appreciated !
You will want to use a split function:
-- Splits #String at every #Delimiter and returns the non-empty items.
--   #String    : delimited list.
--   #Delimiter : single-character separator.
-- Returns table (items varchar(MAX)), rows in list order. Items whose LEN is 0
-- (empty or blank-only) are skipped. A NULL or empty list returns no rows.
create FUNCTION [dbo].[Split](#String varchar(MAX), #Delimiter char(1))
returns #temptable TABLE (items varchar(MAX))
as
begin
    declare #idx int
    -- varchar(max) so that a long item is not silently cut at 8000 characters.
    declare #slice varchar(max)

    select #idx = 1
    if len(#String)<1 or #String is null return

    while #idx!= 0
    begin
        set #idx = charindex(#Delimiter,#String)

        -- Everything left of the delimiter, or the whole remainder when no
        -- delimiter is left.
        if #idx!=0
            set #slice = left(#String,#idx - 1)
        else
            set #slice = #String

        if(len(#slice)>0)
            insert into #temptable(Items) values(#slice)

        -- Keep everything after the delimiter. SUBSTRING from #idx + 1 is used
        -- instead of right(..., len(...) - #idx): len ignores trailing blanks,
        -- so the right() form miscounts and clips items when the list ends in
        -- spaces. datalength only has to be at least the remaining length.
        set #String = substring(#String, #idx + 1, datalength(#String))
        if len(#String) = 0 break
    end
    return
end;
And then you can query the data in the following manner:
-- Expand every comma-separated id list into rows and count each distinct value.
SELECT
    items,
    COUNT(items)
FROM table1 AS t1
CROSS APPLY dbo.split(t1.id, ',')
GROUP BY items
See SQL Fiddle With Demo
Well, the solution i always use, and probably there might be a better way, is to use a function that will split everything. No use for cursors, just a while loop.
-- Re-creates splitValueByDelimiter: drop any existing copy, then create it.
if OBJECT_ID('splitValueByDelimiter') is not null
begin
    drop function splitValueByDelimiter
end
go
-- Splits #inputValue at every #delimiter and returns the non-empty items.
--   #inputValue : delimited list.
--   #delimiter  : single-character separator.
-- Returns table (value varchar(max)), rows in list order. Items whose LEN is 0
-- (empty or blank-only) are skipped. A NULL or empty list returns no rows.
create function splitValueByDelimiter (
    #inputValue varchar(max)
    , #delimiter varchar(1)
)
returns #results table (value varchar(max))
as
begin
    declare #delimeterIndex int
        , #tempValue varchar(max)

    -- Start at a non-zero value so the loop is entered.
    set #delimeterIndex = 1

    while #delimeterIndex > 0 and len(isnull(#inputValue, '')) > 0
    begin
        set #delimeterIndex = charindex(#delimiter, #inputValue)

        -- Everything left of the delimiter, or the whole remainder when no
        -- delimiter is left (which also ends the loop).
        if #delimeterIndex > 0
            set #tempValue = left(#inputValue, #delimeterIndex - 1)
        else
            set #tempValue = #inputValue

        if(len(#tempValue)>0)
        begin
            insert
            into #results
            select #tempValue
        end

        -- Keep everything after the delimiter. SUBSTRING from the next
        -- position is used instead of right(..., len(...) - index): len
        -- ignores trailing blanks, so the right() form miscounts and clips
        -- items when the list ends in spaces. datalength only has to be at
        -- least the remaining length.
        set #inputValue = substring(#inputValue, #delimeterIndex + 1, datalength(#inputValue))
    end
    return
end
After that you can call the output like this :
-- Demo: rebuild a one-column table holding the sample id lists, then expand
-- every list into one row per value with splitValueByDelimiter.
IF OBJECT_ID('test') IS NOT NULL
BEGIN
    DROP TABLE test
END
go
CREATE TABLE test (
    Id varchar(max)
)

INSERT INTO test (Id)
VALUES
    ('10031,10042'),
    ('10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039'),
    ('10009'),
    ('20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041')

SELECT value
FROM test
CROSS APPLY splitValueByDelimiter(Id, ',')
Hope it helps, although i am still looping through everything
After reiterating the comment above about NOT putting multiple values into a single column (Use a separate child table with one value per row!),
Nevertheless, one possible approach: use a UDF to convert delimited string to a table. Once all the values have been converted to tables, combine all the tables into one table and do a group By on that table.
-- Splits the Text value #S at every occurrence of #delim and returns one row
-- per value. The text is read through a window of at most #BtchSiz characters.
--   #S     : delimited text.
--   #delim : delimiter (up to 5 characters); NULL is replaced by '|'.
-- Returns table (ValNum identity, sVal VarChar(8000)); each value is passed
-- through LTrim before it is stored.
-- NOTE(review): #roVal is declared but never referenced in this function.
Create Function dbo.ParseTextString (#S Text, #delim VarChar(5))
Returns #tOut Table
(ValNum Integer Identity Primary Key,
sVal VarChar(8000))
As
Begin
Declare #dlLen TinyInt -- Length of delimiter
Declare #wind VarChar(8000) -- Will Contain Window into text string
Declare #winLen Integer -- Length of Window
Declare #isLastWin TinyInt -- Boolean to indicate processing Last Window
Declare #wPos Integer -- Start Position of Window within Text String
Declare #roVal VarChar(8000)-- String Data to insert into output Table
Declare #BtchSiz Integer -- Maximum Size of Window
Set #BtchSiz = 7900 -- (Reset to smaller values to test routine)
Declare #dlPos Integer -- Position within Window of next Delimiter
Declare #Strt Integer -- Start Position of each data value within Window
-- -------------------------------------------------------------------------
-- ---------------------------
If #delim is Null Set #delim = '|'
-- Nothing to return for empty text or text that consists of the delimiter only.
If DataLength(#S) = 0 Or
Substring(#S, 1, #BtchSiz) = #delim Return
-- --------------------------------------------
-- Load the first window and locate its first delimiter.
Select #dlLen = DataLength(#delim),
#Strt = 1, #wPos = 1,
#wind = Substring(#S, 1, #BtchSiz)
-- A window shorter than #BtchSiz is treated as the last one.
Select #winLen = DataLength(#wind),
#isLastWin = Case When DataLength(#wind) = #BtchSiz
Then 0 Else 1 End,
#dlPos = CharIndex(#delim, #wind, #Strt)
-- --------------------------------------------
While #Strt <= #winLen
Begin
If #dlPos = 0 Begin -- No More delimiters in window
-- Last window: the final value runs to the end of the window.
If #isLastWin = 1 Set #dlPos = #winLen + 1
Else Begin
-- Re-read the text starting at the current, unfinished value, so that the
-- value is at the beginning of the new window.
Set #wPos = #wPos + #Strt - 1
Set #wind = Substring(#S, #wPos, #BtchSiz)
-- ----------------------------------------
Select #winLen = DataLength(#wind), #Strt = 1,
#isLastWin = Case When DataLength(#wind) = #BtchSiz
Then 0 Else 1 End,
#dlPos = CharIndex(#delim, #wind, 1)
-- Still no delimiter: take the whole new window as the value.
If #dlPos = 0 Set #dlPos = #winLen + 1
End
End
-- -------------------------------
-- Store the value between #Strt and the delimiter position.
Insert #tOut (sVal)
Select LTrim(Substring(#wind,
#Strt, #dlPos - #Strt))
-- -------------------------------
-- Move #Strt to char after last delimiter
Set #Strt = #dlPos + #dlLen
Set #dlPos = CharIndex(#delim, #wind, #Strt)
End
Return
End
Then write, (using your table schema),
-- Join every non-null ID list into one comma-separated string, then split that
-- string and count how often each value occurs.
-- The joined string is held in a VarChar(8000) variable.
DECLARE #joined VARCHAR(8000)

SELECT #joined = COALESCE(#joined + ',', '') + ID
FROM Table
WHERE ID IS NOT NULL
-- -----------------------------------------
SELECT
    sVal,
    COUNT(*)
FROM dbo.ParseTextString(#joined, ',')
GROUP BY sVal

stored procedure for inserting comma seperated values in table using ms sql?

I have a table named assignRole.
I am passing string of userid (int) csv ,and passing roleid(int).
I want a stored procedure which split userid from string and take roleid and insert these values in table.this thing is to happen for all values in userid string.
First, create a function:
-- Inline table-valued function that splits a delimited list of integers.
--   #List      : delimited list, e.g. '1,2,3'.
--   #Delimiter : single-character separator.
-- Returns one row per non-empty item, column Item (INT).
-- The list is rewritten as XML (each item wrapped in <i>...</i>), the <i>
-- nodes are shredded, and each node's text is read as an int; items that come
-- back NULL are filtered out.
CREATE FUNCTION [dbo].[SplitInts]
(
    #List VARCHAR(MAX),
    #Delimiter CHAR(1)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
    SELECT CONVERT(INT, parsed.Item) AS Item
    FROM
    (
        SELECT node.el.value('(./text())[1]', 'int') AS Item
        FROM
        (
            SELECT CONVERT(XML, '<i>' + REPLACE(#List, #Delimiter, '</i><i>') + '</i>').query('.') AS [XML]
        ) AS src
        CROSS APPLY src.[XML].nodes('i') AS node(el)
    ) AS parsed
    WHERE parsed.Item IS NOT NULL
);
Now you can say:
-- One assignRole row per user id in the list, all with the same role id.
INSERT INTO dbo.assignRole (RoleID, UserID)
SELECT
    #RoleID,
    Item
FROM dbo.SplitInts(#UserIDList, ',');
I like to use a table-valued function to do the split.
-- Re-creates dbo.StrSplit and grants SELECT on it to public.
IF OBJECT_ID (N'dbo.StrSplit') IS NOT NULL DROP FUNCTION dbo.[StrSplit]
GO
-- Splits #String at every #Delimiter and returns one row per item.
--   #String    : delimited list.
--   #Delimiter : single-character separator.
-- Returns table (Items NVARCHAR(MAX)), rows in list order. Empty items are
-- returned as ''; a trailing delimiter adds no extra row. A NULL input returns
-- no rows; an empty input returns a single '' row.
-- Variable names use one consistent casing so the function also compiles on a
-- case-sensitive instance.
CREATE FUNCTION [dbo].[StrSplit]
(
    #String VARCHAR(MAX), #Delimiter char(1)
)
RETURNS
#Results TABLE (
    Items NVARCHAR(MAX)
)
AS
BEGIN
    DECLARE #Index INT
    DECLARE #Slice nvarchar(MAX)

    -- Must start non-zero so the loop body runs at least once.
    SELECT #Index = 1

    -- Early exit if the passed string is null.
    IF #String IS NULL RETURN

    WHILE #Index != 0
    BEGIN
        -- Position of the first delimiter (0 when none is left).
        SELECT #Index = CHARINDEX(#Delimiter, #String)

        -- Everything left of the delimiter, or the whole remainder.
        IF #Index != 0
            SELECT #Slice = LEFT(#String, #Index - 1)
        ELSE
            SELECT #Slice = #String

        INSERT INTO #Results(Items) VALUES(#Slice)

        -- Keep everything after the delimiter. SUBSTRING from #Index + 1 is
        -- used instead of RIGHT(..., LEN(...) - #Index): LEN ignores trailing
        -- blanks, so the RIGHT form miscounts and drops or clips items when
        -- the list ends in spaces. DATALENGTH only has to be at least the
        -- remaining length.
        SELECT #String = SUBSTRING(#String, #Index + 1, DATALENGTH(#String))

        -- Done when nothing but blanks (or nothing at all) is left.
        IF LEN(#String) = 0 BREAK
    END
    RETURN
END
GO
GRANT SELECT ON [dbo].[StrSplit] TO [public]

SQL Server 2008 and HashBytes

I have quite a large nvarchar which I wish to pass to the HashBytes function.
I get the error:
"String or binary would be truncated.
Cannot insert the value NULL into
column 'colname', table 'table';
column does not allow nulls. UPDATE
fails. The statement has been
terminated."
Being ever resourceful, I discovered this was due to the HashBytes function having a maximum input limit of 8000 bytes. Further searching showed me a 'solution' in which my large varchar would be divided into chunks that are hashed separately and then combined, using this user-defined function:
-- Question's attempt at hashing input larger than the HASHBYTES limit: hash the
-- data in 8000-byte pieces and concatenate the piece hashes.
--   #algorithm       : HASHBYTES algorithm name.
--   #InputDataString : text to hash.
-- Returns the concatenated piece hashes, prefixed by the initial value of
-- #ReturnSum (see below).
function [dbo].[udfLargeHashTable] (#algorithm nvarchar(4), #InputDataString varchar(MAX))
RETURNS varbinary(MAX)
AS
BEGIN
DECLARE
#Index int,
#InputDataLength int,
#ReturnSum varbinary(max),
#InputData varbinary(max)
-- The integer 0 assigned to a varbinary becomes the four bytes 0x00000000, so
-- every result starts with those bytes and differs from a plain HASHBYTES call.
SET #ReturnSum = 0
SET #Index = 1
-- NOTE(review): CONVERT to binary without a length uses the default length of
-- 30 bytes, so the input is cut or padded to 30 bytes here; confirm against
-- the CAST and CONVERT documentation.
SET #InputData = convert(binary,#InputDataString)
SET #InputDataLength = DATALENGTH(#InputData)
WHILE #Index <= #InputDataLength
BEGIN
-- '+' on varbinary values concatenates: each piece hash is appended.
SET #ReturnSum = #ReturnSum + HASHBYTES(#algorithm, SUBSTRING(#InputData, #Index, 8000))
SET #Index = #Index + 8000
END
RETURN #ReturnSum
END
which I call with:
set #ReportDefinitionHash=convert(int,dbo.[udfLargeHashTable]('SHA1',#ReportDefinitionForLookup))
Where #ReportDefinitionHash is int, and #ReportDefinitionForLookup is the varchar
Passing a simple char like 'test' produces a different int with my UDF than a normal call to HashBytes would produce.
Any advice on this issue?
If you can't create a function and have to use something that already exists in the DB:
sys.fn_repl_hash_binary
can be made to work using the syntax:
sys.fn_repl_hash_binary(cast('some really long string' as varbinary(max)))
Taken from: http://www.sqlnotes.info/2012/01/16/generate-md5-value-from-big-data/
Just use this function (taken from Hashing large data strings with a User Defined Function):
create function dbo.fn_hashbytesMAX
    ( #string nvarchar(max)
    , #Algo varchar(10)
    )
returns varbinary(20)
as
/************************************************************
*
* Author: Brandon Galderisi
* Last modified: 15-SEP-2009 (by Denis)
* Purpose: uses the system function hashbytes as well
*          as sys.fn_varbintohexstr to split an
*          nvarchar(max) string and hash in 8000 byte
*          chunks hashing each 8000 byte chunk,
*          getting the 40 byte output, streaming each
*          40 byte output into a string then hashing
*          that string.
*
* Parameters:
*   #string - text to hash.
*   #Algo   - HASHBYTES algorithm name.
* Returns: the hash as varbinary(20). Input of at most 4000
*          characters is hashed directly; longer input is
*          hashed chunk by chunk and the function calls itself
*          on the concatenated chunk hashes.
*
* The chunk hashes are concatenated in chunk order (order by
* rn); without an explicit order the concatenation sequence,
* and therefore the final hash, is not guaranteed.
*
*************************************************************/
begin
    declare #concat nvarchar(max)
           ,#NumHash int
           ,#HASH varbinary(20)

    -- Number of 4000-character (8000-byte) chunks in the input.
    set #NumHash = ceiling((datalength(#string)/2)/(4000.0))

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    if #NumHash>1
    begin
        -- # * 4000 character strings
        ;with a as (select 1 as n union all select 1) -- 2
        ,b as (select 1 as n from a ,a a1) -- 4
        ,c as (select 1 as n from b ,b b1) -- 16
        ,d as (select 1 as n from c ,c c1) -- 256
        ,e as (select 1 as n from d ,d d1) -- 65,536
        ,f as (select 1 as n from e ,e e1) -- 4,294,967,296 = 17+ TRILLION characters
        ,factored as (select row_number() over (order by n) rn from f)
        -- factor is the start position of the chunk AFTER chunk rn,
        -- so chunk rn starts at factor - 4000.
        ,factors as (select rn,(rn*4000)+1 factor from factored)
        select #concat = cast((
                select right(sys.fn_varbintohexstr
                             (
                                 hashbytes(#Algo, substring(#string, factor - 4000, 4000))
                             )
                            , 40) + ''
                from factors
                where rn <= #NumHash
                order by rn
                for xml path('')
            ) as nvarchar(max))

        -- Hash the concatenated chunk hashes (recursing again if still too long).
        set #HASH = dbo.fn_hashbytesMAX(#concat ,#Algo)
    end
    else
    begin
        set #HASH = convert(varbinary(20), hashbytes(#Algo, #string))
    end
    return #HASH
end
And the results are as following:
-- Side-by-side comparison of the native function and the two user functions.
SELECT
    HASHBYTES('sha1', N'test'),             -- native function with nvarchar input
    HASHBYTES('sha1', 'test'),              -- native function with varchar input
    dbo.fn_hashbytesMAX('test', 'sha1'),    -- Galderisi's function which casts to nvarchar input
    dbo.fnGetHash('sha1', 'test')           -- your function
Output:
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0xA94A8FE5CCB19BA61C4C0873D391E987982FBBD3
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0x00000000AE6DBA4E0F767D06A97038B0C24ED720662ED9F1
I've taken the accepted answer, and modified it a bit with the following improvements:
no longer recursive function
now schema bound
no longer relying on undocumented stored procedures
two versions: one for nvarchar, one for varchar
returns same data size as HASHBYTES, leaving it up to the end user to convert to smaller based on algorithm used. This allows the functions to support future algorithms with larger data returns.
With these changes, the functions can now be used in persisted computed columns as they are now marked deterministic when created.
-- HASHBYTES for NVARCHAR(MAX) input of any length.
--   #Algorithm : HASHBYTES algorithm name.
--   #Text      : text to hash.
-- Returns the hash as VARBINARY(8000). Input of at most 8000 bytes is hashed
-- directly. Longer input is cut into 4000-character chunks; each chunk is
-- hashed, the hashes are concatenated as hex text in chunk order, and the
-- process repeats on that text until it fits into a single HASHBYTES call.
CREATE FUNCTION dbo.fnHashBytesNVARCHARMAX
(
    #Algorithm VARCHAR(10),
    #Text NVARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE #NumHash INT;
    DECLARE #HASH VARBINARY(8000);

    -- Number of 8000-byte chunks in the input.
    SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE #NumHash > 1
    BEGIN
        -- # * 4000 character strings
        WITH a AS
        (SELECT 1 AS n UNION ALL SELECT 1), -- 2
        b AS
        (SELECT 1 AS n FROM a, a a1), -- 4
        c AS
        (SELECT 1 AS n FROM b, b b1), -- 16
        d AS
        (SELECT 1 AS n FROM c, c c1), -- 256
        e AS
        (SELECT 1 AS n FROM d, d d1), -- 65,536
        f AS
        (SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 = 17+ TRILLION characters
        factored AS
        (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        -- factor is the start position of the chunk AFTER chunk rn,
        -- so chunk rn starts at factor - 4000.
        factors AS
        (SELECT rn, (rn * 4000) + 1 factor FROM factored)
        SELECT #Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(#Algorithm, SUBSTRING(#Text, factor - 4000, 4000)), 1)
                FROM factors
                WHERE rn <= #NumHash
                -- Concatenate in chunk order; without an explicit order the
                -- sequence, and therefore the final hash, is not guaranteed.
                ORDER BY rn
                FOR XML PATH('')
            ) AS NVARCHAR(MAX)
        );

        SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
    END;

    SET #HASH = CONVERT(VARBINARY(8000), HASHBYTES(#Algorithm, #Text));
    RETURN #HASH;
END;
GO
-- HASHBYTES for VARCHAR(MAX) input of any length.
--   #Algorithm : HASHBYTES algorithm name.
--   #Text      : text to hash.
-- Returns the hash as VARBINARY(8000). Input of at most 8000 bytes is hashed
-- directly. Longer input is cut into 8000-character chunks; each chunk is
-- hashed, the hashes are concatenated as hex text in chunk order, and the
-- process repeats on that text until it fits into a single HASHBYTES call.
CREATE FUNCTION dbo.fnHashBytesVARCHARMAX
(
    #Algorithm VARCHAR(10),
    #Text VARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE #NumHash INT;
    DECLARE #HASH VARBINARY(8000);

    -- Number of 8000-byte chunks in the input.
    SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE #NumHash > 1
    BEGIN
        -- # * 8000 character strings
        WITH a AS
        (SELECT 1 AS n UNION ALL SELECT 1), -- 2
        b AS
        (SELECT 1 AS n FROM a, a a1), -- 4
        c AS
        (SELECT 1 AS n FROM b, b b1), -- 16
        d AS
        (SELECT 1 AS n FROM c, c c1), -- 256
        e AS
        (SELECT 1 AS n FROM d, d d1), -- 65,536
        f AS
        (SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 = 17+ TRILLION characters
        factored AS
        (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        -- factor is the start position of the chunk AFTER chunk rn,
        -- so chunk rn starts at factor - 8000.
        factors AS
        (SELECT rn, (rn * 8000) + 1 factor FROM factored)
        SELECT #Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(#Algorithm, SUBSTRING(#Text, factor - 8000, 8000)), 1)
                FROM factors
                WHERE rn <= #NumHash
                -- Concatenate in chunk order; without an explicit order the
                -- sequence, and therefore the final hash, is not guaranteed.
                ORDER BY rn
                FOR XML PATH('')
            ) AS NVARCHAR(MAX)
        );

        SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
    END;

    SET #HASH = CONVERT(VARBINARY(8000), HASHBYTES(#Algorithm, #Text));
    RETURN #HASH;
END;
GO
You could write a SQL CLR function:
/// <summary>
/// SQL CLR scalar function that hashes text of any length.
/// </summary>
/// <param name="algorithm">Hash algorithm name as understood by
/// <c>HashAlgorithm.Create</c>, e.g. "md5".</param>
/// <param name="data">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>The hash bytes, or SQL NULL when either argument is NULL.</returns>
/// <exception cref="System.ArgumentException">The algorithm name is not recognised.</exception>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBinary BigHashBytes(SqlString algorithm, SqlString data)
{
    // Reading .Value of a NULL SqlString throws, so answer NULL for NULL input.
    if (algorithm.IsNull || data.IsNull)
    {
        return SqlBinary.Null;
    }

    // The hash object is disposable; release it once the hash is computed.
    using (var algo = HashAlgorithm.Create(algorithm.Value))
    {
        // Create returns null for an unknown name; report that explicitly.
        if (algo == null)
        {
            throw new System.ArgumentException("Unknown hash algorithm: " + algorithm.Value, "algorithm");
        }

        var bytes = Encoding.UTF8.GetBytes(data.Value);
        return new SqlBinary(algo.ComputeHash(bytes));
    }
}
And then it can be called in SQL like this:
--these return the same value
-- Native function and CLR function called with the same arguments.
SELECT HASHBYTES('md5', 'test stuff')
SELECT dbo.BigHashBytes('md5', 'test stuff')
The BigHashBytes is only necessary if the length would be over 8k.
tested and working
select master.sys.fn_repl_hash_binary(someVarbinaryMaxValue)
moreover not complicated :)
This can be used as function body, too:
-- Hashes #A in 4000-character pieces, concatenates the piece hashes and hashes
-- that concatenation. A NULL input yields NULL, as HASHBYTES does, instead of
-- looping until #position overflows (the BREAK test is never true when #len
-- is NULL).
-- NOTE(review): #len is a byte count (DATALENGTH) while #position counts
-- characters, so for long input the loop also hashes empty trailing pieces.
-- That is kept as is, so existing results do not change.
DECLARE #A NVARCHAR(MAX) = N'test'
-- Accumulator for the piece hashes; NULL when there is nothing to hash.
DECLARE #res VARBINARY(MAX) = CASE WHEN #A IS NULL THEN NULL ELSE 0x END
DECLARE #position INT = 1
       ,#len INT = DATALENGTH(#A)

WHILE #A IS NOT NULL
BEGIN
    SET #res = #res + HASHBYTES('SHA2_256', SUBSTRING(#A, #position, 4000))
    SET #position = #position + 4000
    IF #position > #len
        BREAK
END

SELECT HASHBYTES('SHA2_256', #res)
The idea is to hash each 4000-character part of the NVARCHAR(MAX) string and concatenate the results, then to hash the concatenated result.
It seems the easiest solution is to write a recursive hashing algorithm that parses the input text value into sub varchar(8000) segments.
I arbitrarily chose to slice the input string into 7500 character segments
The hashing algorithm returns a varbinary(20) which can easily be converted into a varchar(20)
-- SHA1 hash for text of any length, computed recursively.
--   #TextValue : text to hash.
-- Returns varbinary(20).
--   * NULL input returns the hash of the literal 'null'.
--   * Text of at most 7500 characters is hashed directly.
--   * Longer text: the remainder after the first 7500 characters is hashed by
--     a recursive call, and the first part plus that hash (as hex text) is
--     hashed.
-- The text is copied into varchar variables before hashing.
ALTER FUNCTION [dbo].[BigHash]
(
    #TextValue nvarchar(max)
)
RETURNS varbinary(20)
AS
BEGIN
    -- "= null" is never true; IS NULL is required for this branch to run.
    if #TextValue is null
        return hashbytes('SHA1', 'null')

    Declare #FirstPart as varchar(7500)
    Declare #Remainder as varchar(max)
    Declare #RemainderHash as varbinary(20)
    Declare #BinaryValue as varbinary(20)
    Declare #TextLength as integer

    Set #TextLength = len(#TextValue)

    if #TextLength > 7500
    Begin
        Set #FirstPart = substring(#TextValue, 1, 7500)
        Set #Remainder = substring(#TextValue, 7501, #TextLength - 7500)
        -- Hash the tail first, then fold its hash into the head.
        Set #RemainderHash = dbo.BigHash(#Remainder)
        Set #BinaryValue = hashbytes('SHA1', #FirstPart + convert( varchar(20), #RemainderHash, 2 ))
        return #BinaryValue
    End
    else
    Begin
        Set #FirstPart = substring(#TextValue, 1, #TextLength)
        Set #BinaryValue = hashbytes('SHA1', #FirstPart)
        return #BinaryValue
    End

    -- Never reached; a function body must end with a RETURN statement.
    return null
END