sql server 2008 not generating random value - sql-server-2008

I'm having a problem with random values being generated for each row in a result set in SQL Server 2008. I found a similar question here, but upon implementing the proposed answer, I saw the same problem as before. When running the query I have provided below, it seems that the same values will sometimes show up in consecutive rows, even though I'm calling for a new NEWID() with each row.
DECLARE #Id int = 0
DECLARE #Counter int = 1
DECLARE #Value int
CREATE TABLE #Table1
(
id int identity(1,1)
,Value int
)
WHILE #Counter < 100000
BEGIN
INSERT INTO #Table1 (Value)
SELECT CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT)
SET #Counter += 1
END
SET #Counter = 0
WHILE #Counter < 5
BEGIN
SELECT
#Value = T.Value
,#Id = T.id
FROM #Table1 T
WHERE T.id = CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + #Counter
IF #Id <> 0
SELECT #Value AS Value ,#Id as ID
SET #Counter += 1
END
DROP TABLE #Table1
If I change the INT to a BIGINT, as suggested in the link I provided, nothing is solved, so I don't believe that it's an "overflow" issue.

If I take the calculation out of the select, I don't get the doubled rows:
DECLARE #Id int = 0
DECLARE #Counter int = 1
DECLARE #Value int
-- new variable
DECLARE #DERIVED INT
CREATE TABLE #Table1
(
id int identity(1,1)
,Value int
)
WHILE #Counter < 100000
BEGIN
INSERT INTO #Table1 (Value)
SELECT CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT)
SET #Counter += 1
END
SET #Counter = 0
WHILE #Counter < 5
BEGIN
--set here to remove calculation from the select
SET #DERIVED = CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + #Counter;
SELECT
#Value = T.Value
,#Id = T.id
FROM #Table1 T
WHERE T.id = #DERIVED
IF #Id <> 0
SELECT #Value AS Value ,#Id as ID;
SET #Counter += 1
END
DROP TABLE #Table1
I'm seeing the duplicates every time with the pseudorandom generator inside the select. Oddly enough, I get about the same frequency of duplicates on the insert loop whether or not the calculation is inside the insert... select. It could be coincidence, since we are dealing with a randomly selected number. Note also, that since you're adding to the pseudorandom result, the results aren't technically duplicates. They're descending sequences:
11111 + 1 + 1 = 11113
11110 + 1 + 2 = 11113
Same overall result, different pseudorandom result. However, if I change
CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + 1 + #Counter
to
CAST(RAND(CHECKSUM(NEWID())) * 100000 as INT) + #Counter + #Counter
I still consistently get duplicates. That implies that the optimizer may be caching/re-using values, at least on the select. I'd call that improper for a non-deterministic function call. I get similar results on 10.0.1600 and 10.50.1600 (2008 RTM and 2008R2 RTM).

Related

T-SQL: split and aggregate comma-separated values

I have the following table with each row having comma-separated values:
ID
-----------------------------------------------------------------------------
10031,10042
10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039
10009
20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041
I need to get a count for each ID (a groupby).
I am just trying to avoid cursor implementation and stumped on how to do this without cursors.
Any Help would be appreciated !
You will want to use a split function:
create FUNCTION [dbo].[Split](#String varchar(MAX), #Delimiter char(1))
returns #temptable TABLE (items varchar(MAX))
as
begin
declare #idx int
declare #slice varchar(8000)
select #idx = 1
if len(#String)<1 or #String is null return
while #idx!= 0
begin
set #idx = charindex(#Delimiter,#String)
if #idx!=0
set #slice = left(#String,#idx - 1)
else
set #slice = #String
if(len(#slice)>0)
insert into #temptable(Items) values(#slice)
set #String = right(#String,len(#String) - #idx)
if len(#String) = 0 break
end
return
end;
And then you can query the data in the following manner:
select items, count(items)
from table1 t1
cross apply dbo.split(t1.id, ',')
group by items
See SQL Fiddle With Demo
Well, the solution i always use, and probably there might be a better way, is to use a function that will split everything. No use for cursors, just a while loop.
if OBJECT_ID('splitValueByDelimiter') is not null
begin
drop function splitValueByDelimiter
end
go
create function splitValueByDelimiter (
#inputValue varchar(max)
, #delimiter varchar(1)
)
returns #results table (value varchar(max))
as
begin
declare #delimeterIndex int
, #tempValue varchar(max)
set #delimeterIndex = 1
while #delimeterIndex > 0 and len(isnull(#inputValue, '')) > 0
begin
set #delimeterIndex = charindex(#delimiter, #inputValue)
if #delimeterIndex > 0
set #tempValue = left(#inputValue, #delimeterIndex - 1)
else
set #tempValue = #inputValue
if(len(#tempValue)>0)
begin
insert
into #results
select #tempValue
end
set #inputValue = right(#inputValue, len(#inputValue) - #delimeterIndex)
end
return
end
After that you can call the output like this :
if object_id('test') is not null
begin
drop table test
end
go
create table test (
Id varchar(max)
)
insert
into test
select '10031,10042'
union all select '10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039'
union all select '10009'
union all select '20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041'
select value
from test
cross apply splitValueByDelimiter(Id, ',')
Hope it helps, although i am still looping through everything
After reiterating the comment above about NOT putting multiple values into a single column (Use a separate child table with one value per row!),
Nevertheless, one possible approach: use a UDF to convert delimited string to a table. Once all the values have been converted to tables, combine all the tables into one table and do a group By on that table.
Create Function dbo.ParseTextString (#S Text, #delim VarChar(5))
Returns #tOut Table
(ValNum Integer Identity Primary Key,
sVal VarChar(8000))
As
Begin
Declare #dlLen TinyInt -- Length of delimiter
Declare #wind VarChar(8000) -- Will Contain Window into text string
Declare #winLen Integer -- Length of Window
Declare #isLastWin TinyInt -- Boolean to indicate processing Last Window
Declare #wPos Integer -- Start Position of Window within Text String
Declare #roVal VarChar(8000)-- String Data to insert into output Table
Declare #BtchSiz Integer -- Maximum Size of Window
Set #BtchSiz = 7900 -- (Reset to smaller values to test routine)
Declare #dlPos Integer -- Position within Window of next Delimiter
Declare #Strt Integer -- Start Position of each data value within Window
-- -------------------------------------------------------------------------
-- ---------------------------
If #delim is Null Set #delim = '|'
If DataLength(#S) = 0 Or
Substring(#S, 1, #BtchSiz) = #delim Return
-- --------------------------------------------
Select #dlLen = DataLength(#delim),
#Strt = 1, #wPos = 1,
#wind = Substring(#S, 1, #BtchSiz)
Select #winLen = DataLength(#wind),
#isLastWin = Case When DataLength(#wind) = #BtchSiz
Then 0 Else 1 End,
#dlPos = CharIndex(#delim, #wind, #Strt)
-- --------------------------------------------
While #Strt <= #winLen
Begin
If #dlPos = 0 Begin -- No More delimiters in window
If #isLastWin = 1 Set #dlPos = #winLen + 1
Else Begin
Set #wPos = #wPos + #Strt - 1
Set #wind = Substring(#S, #wPos, #BtchSiz)
-- ----------------------------------------
Select #winLen = DataLength(#wind), #Strt = 1,
#isLastWin = Case When DataLength(#wind) = #BtchSiz
Then 0 Else 1 End,
#dlPos = CharIndex(#delim, #wind, 1)
If #dlPos = 0 Set #dlPos = #winLen + 1
End
End
-- -------------------------------
Insert #tOut (sVal)
Select LTrim(Substring(#wind,
#Strt, #dlPos - #Strt))
-- -------------------------------
-- Move #Strt to char after last delimiter
Set #Strt = #dlPos + #dlLen
Set #dlPos = CharIndex(#delim, #wind, #Strt)
End
Return
End
Then write, (using your table schema),
Declare #AllVals VarChar(8000)
Select #AllVals = Coalesce(#allVals + ',', '') + ID
From Table Where ID Is Not null
-- -----------------------------------------
Select sVal, Count(*)
From dbo.ParseTextString(#AllVals, ',')
Group By sval

MySQL limit by sum

I want to limit my SELECT results in mySQL by sum.
For Example, this is my table:
(id, val)
Data Entries:
(1,100),
(2,300),
(3,50),
(4,3000)
I want to select first k entries such that the sum of val in those entries is just enough to make it to M.
For example, I want to find entries such that M = 425.
The result should be (1,100),(2,300),(3,50).
How can I do that in a mysql select query?
Try this variant -
SET #sum = 0;
SELECT id, val FROM (
SELECT *, #sum:=#sum + val mysum FROM mytable2 ORDER BY id
) t
WHERE mysum <= 450;
+------+------+
| id | val |
+------+------+
| 1 | 100 |
| 2 | 300 |
| 3 | 50 |
+------+------+
This stored procedure might help:
DELIMITER ;;
CREATE PROCEDURE selectLimitBySum (IN m INT)
BEGIN
DECLARE mTmp INT DEFAULT 0;
DECLARE idTmp INT DEFAULT 0;
DECLARE valTmp INT DEFAULT 0;
DECLARE doneLoop SMALLINT DEFAULT 0;
DECLARE crsSelect CURSOR FOR SELECT id, val FROM test3;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET doneLoop = 1;
OPEN crsSelect;
aloop: LOOP
SET idTmp = 0;
SET valTmp = 0;
FETCH crsSelect INTO idTmp, valTmp;
if doneLoop THEN
LEAVE aloop;
END IF;
SELECT idTmp, valTmp;
SET mTmp = mTmp + valTmp;
if mTmp > m THEN
LEAVE aloop;
END IF;
END LOOP;
CLOSE crsSelect;
END ;;
DELIMITER ;
Please feel free to change the table names or variable names as per your needs.
from mysql reference manual:
The LIMIT clause can be used to constrain the number of rows returned by the SELECT statement. LIMIT takes one or two numeric arguments, which must both be nonnegative integer constants (except when using prepared statements).
So you cannot use limit the way you proposed. To achieve what you want you need to use your application (java, c, php or whatever else), read the result set row by row, and stop when your condition is reached.
or you can use a prepared statement, but anyway you cant have conditional limit (it must be a constant value) and it is not exactly what you asked for.
create table #limit(
id int,
val int
)
declare #sum int, #id int, #val int, #m int;
set #sum=0;
set #m=250; --Value of an entry
declare limit_cursor cursor for
select id, val from your_table order by id
open limit_cursor
fetch next from limit_cursor into #id, #val
while(##fetch_status=0)
begin
if(#sum<#m)
begin
set #sum = #sum+#val;
INSERT INTO #limit values (#id, #val);
fetch next from limit_cursor into #id, #val
end
else
begin
goto case1;
end
end
case1:
close limit_cursor
deallocate limit_cursor
select * from #limit
truncate table #limit

How to loop the data and if username exists append incremental number e.g. JOHSMI1 or JOHSMI2

I have a userid table
UserId
JHOSMI
KALVIE
etc...
What I would like to do is create a select statement and pass user id, if the userid already exists then append 1 to the id, This gets complicated if you already have JHOSMI, JHOSMI1, then I want to return JHOSMI2.
Really appreciate help here.
Thanks in advance
edited 21-Jul
this is what i got so far.. but not working the way
select #p AS StaffID,
#old_p := #p,
#Cnt := #Cnt+1 As Lvl,
(SELECT #p :=Concat(#i, #Cnt)
FROM departmenttaff
WHERE upper(trim(UserId)) = upper(trim(StaffID))
AND upper(trim(department)) like upper(trim('SERVICE'))
) AS dummy
FROM (
SELECT
#i := upper(trim('JOHSMI')),
#p := upper(trim('JOHSMI')),
#old_p :='',
#Cnt:=0
) vars,
departmenttaff p
WHERE #p <> #old_p
order by Lvl Desc LIMIT 1;
This will do exactly what you want. You will need a unique constraint on your column.
You might also need to add in error code if success = 0.
This is in MSSQL, you will need to add the relevant commands for MySQL. I do not have MySQL so I cannot test it.
NOTE: You can replace the try catch with some IF EXISTS logic. I just prefer the try catch because its more stable for multiple threads.
begin tran
select * from #tmp
declare #success bit
declare #name varchar(50)
declare #newname varchar(50)
declare #nextid int
declare #attempts int
set #name = 'brad2something'
set #success = 0
set #attempts = 0
while #success = 0 and #attempts < 5 begin
begin try
set #attempts = #attempts + 1 -- failsafe
set #newname = #name
if exists (select * from #tmp where username = #name) begin
select #nextid = isnull(max(convert(int, substring(username, LEN(#name) + 1, 50))), 0) + 1
from #tmp where username like #name + '%' and isnumeric(substring(username, LEN(#name) + 1, 50)) = 1
set #newname = #name + CONVERT(varchar(20), #nextid)
end
insert into #tmp (username) values (#newname)
set #success = 1
end try begin catch end catch
end
--insert into #tmp (username)
--select
select #success
select * from #tmp
rollback
/*
drop table #tmp
create table #tmp (
username varchar(50) not null unique
)
insert into #tmp (username)
select 'brad'
union all select 'brad1'
union all select 'brad2something5'
union all select 'brad2'
union all select 'laney'
union all select 'laney500'
*/
I noticed you want to back fill data. If you want to back fill then this will work. It is extremely inefficient but there is no way around it. There is optimizing code you can put in for when an "error" occurs to prevent all previous counts from happening, but this will work.
begin tran
select * from #tmp
declare #success bit
declare #name varchar(50)
declare #newname varchar(50)
declare #nextid int
declare #attempts int
set #name = 'laney'
set #success = 0
set #attempts = 0
set #nextid = 1
while #success = 0 and #attempts < 5 begin
begin try
if exists (select * from #tmp where username = #name) begin
set #newname = #name + CONVERT(varchar(20), #nextid)
while exists (select * from #tmp where username = #newname) begin
set #nextid = #nextid + 1
set #newname = #name + CONVERT(varchar(20), #nextid)
end
end else
set #newname = #name
set #attempts = #attempts + 1 -- failsafe
insert into #tmp (username) values (#newname)
set #success = 1
end try begin catch end catch
end
--insert into #tmp (username)
--select
select #success
select * from #tmp
rollback
/*
drop table #tmp
create table #tmp (
username varchar(50) not null unique
)
insert into #tmp (username)
select 'brad'
union all select 'brad1'
union all select 'brad2something5'
union all select 'brad2'
union all select 'laney'
union all select 'laney500'
*/
Is it mandatory to have the count in same column? its better to have it in a different integer column. Anyways, if this is the requirement then select userid from table where userid like 'JHOSMI%', then do extract the number using mysql substr function.
For other people who might find this, here's a version in PostgreSQL:
create or replace function uniquify_username(varchar) returns varchar as $$
select $1 || coalesce((max(num) + 1)::varchar, '')
from
(select
substring(name, '^(.*?)[0-9]*$') as prefix,
coalesce(substring(name, '.*([0-9]+)$'), '0')::integer as num
from user1) users
where prefix = $1
$$ LANGUAGE sql;
I think it could be adapted to MySQL (though probably not as a stored procedure) but I don't have a MySQL server handy to do the conversion on.
Put a UNIQUE constraint on the column.
You didn't say what language you are using, so use this pseudo code
counter = 0
finished = false
while finished = false
{
try
{
if counter >= 1 then name = name + counter
counter = counter + 1
insert into table (name)
}
}
This code is extremely finicky. But will get the job done and there is no real other way to do this except for in sql, and you will always have some type of try catch to avoid two processes running at the same time. This way you use the unique key constraint to force the error, and supress it because it is expected.
I in no way condone using try/catch for business logic like this, but you are putting yourself in a situation thats unavoidable. I would say put the ID in a seperate column and make a unique constraint on both fields.
Proper solution:
Columns: Name, ID, Display Name
Unique constraint on: Name, ID
Display Name is a computed column (virtual) is Name + ID
If you do it this way, then all you have to do is INSERT INTO table (name, (select max() from table))

Handling larger strings in Sql Server 2008

We have an stored procedure that we created so that user can write comma separated search tags in their software product's admin. So he can add comma-separated tags and in case if he wants to edit them, we read from the table all the tags, recreate them as comma-separated values (CSV) in stored procedure and returns that to the calling code. What happened recently, the user complained that he could not see the new CSVs he wrote. I looked into it and found out that the stored procedure is truncating the string when it reads values from database and creates CSV string. The string is of type nvarchar, and because its exceeding the max characters of 4000 limit, the values gets truncated. Any ideas on how to work out that problem.
Find my code underneath.
BEGIN
BEGIN
Declare #Synonyms Table
(
RowID int Identity(1,1),
SynonymID int,
[Synonym] nvarchar(4000)
);
SET NOCOUNT ON;
Insert #Synonyms(SynonymID, [Synonym])
Select distinct SynonymID, [Synonym] From RF_SearchSynonyms with(nolock) Where SearchTermID = #SearchTermID And ActiveInd = 1
If((Select COUNT(RowID) From #Synonyms) <> 0)
BEGIN
Declare #CurrentRow int = (Select MIN(RowID) From #Synonyms),
#TotalRows int = (Select MAX(RowID) From #Synonyms),
#Synonyms_CSV nvarchar(4000) = '';
WHILE #CurrentRow <= #TotalRows
BEGIN
Declare #TempSyn nvarchar(500);
Select #TempSyn = [Synonym] + ',' From #Synonyms Where RowID = #CurrentRow;
Set #Synonyms_CSV = #Synonyms_CSV + LTRIM(RTRIM(LOWER(#TempSyn)));
SET #CurrentRow = #CurrentRow + 1
END
END
Else
BEGIN
Set #Synonyms_CSV = '';
END
END
BEGIN
Declare #SKUs Table
(
RowID int Identity(1,1),
SkuID int,
SKU nvarchar(15)
);
SET NOCOUNT ON;
Insert #SKUs(SkuID, SKU)
Select distinct SkuID, SKU From RF_SearchSkus with(nolock) Where SearchTermID = #SearchTermID And ActiveInd = 1
If((Select COUNT(RowID) From #SKUs) <> 0)
BEGIN
Declare #CurrentRow1 int = (Select MIN(RowID) From #SKUs),
#TotalRows1 int = (Select MAX(RowID) From #SKUs),
#Skus_CSV nvarchar(4000) = '';
WHILE #CurrentRow1 <= #TotalRows1
BEGIN
Declare #TempSku nvarchar(15);
Select #TempSku = SKU + ',' From #SKUs Where RowID = #CurrentRow1;
Set #Skus_CSV = #Skus_CSV + LTRIM(RTRIM(#TempSku));
SET #CurrentRow1 = #CurrentRow1 + 1
END
END
Else
BEGIN
Set #Skus_CSV = '';
END
END
BEGIN
Declare #Combined varchar(8000),
#syn_len int = 0,
#sku_len int = 0;
Select #syn_len = LEN(#Synonyms_CSV);
Select #sku_len = LEN(#Skus_CSV);
Select #Combined = #Synonyms_CSV + '-_-' + #Skus_CSV;
Select #Synonyms_CSV + '-_-' + #Skus_CSV;
END
END
I can't use text and ntext as they do not play nice with concatenation operations.
Thanks.
How are your declaring the string parameter?
nvarchar(max)
supports up to 2^32-1 (2GB)
See this link.

SQL Server 2008 and HashBytes

I have quite a large nvarchar which I wish to pass to the HashBytes function.
I get the error:
"String or binary would be truncated.
Cannot insert the value NULL into
column 'colname', tbale 'table';
column does not allow nulls. UPDATE
fails. The statement has been
terminated."
Being ever resourceful, I discovered this was due to the HashBytes function having a maximum limit of 8000 bytes. Further searching showed me a 'solution' where my large varchar would be divided and hashed seperately and then later combined with this user defined function:
function [dbo].[udfLargeHashTable] (#algorithm nvarchar(4), #InputDataString varchar(MAX))
RETURNS varbinary(MAX)
AS
BEGIN
DECLARE
#Index int,
#InputDataLength int,
#ReturnSum varbinary(max),
#InputData varbinary(max)
SET #ReturnSum = 0
SET #Index = 1
SET #InputData = convert(binary,#InputDataString)
SET #InputDataLength = DATALENGTH(#InputData)
WHILE #Index <= #InputDataLength
BEGIN
SET #ReturnSum = #ReturnSum + HASHBYTES(#algorithm, SUBSTRING(#InputData, #Index, 8000))
SET #Index = #Index + 8000
END
RETURN #ReturnSum
END
which I call with:
set #ReportDefinitionHash=convert(int,dbo.[udfLargeHashTable]('SHA1',#ReportDefinitionForLookup))
Where #ReportDefinitionHash is int, and #ReportDefinitionForLookup is the varchar
Passing a simple char like 'test' produces a different int with my UDF than a normal call to HashBytes would produce.
Any advice on this issue?
If you can't create a function and have to use something that already exists in the DB:
sys.fn_repl_hash_binary
can be made to work using the syntax:
sys.fn_repl_hash_binary(cast('some really long string' as varbinary(max)))
Taken from: http://www.sqlnotes.info/2012/01/16/generate-md5-value-from-big-data/
Just use this function (taken from Hashing large data strings with a User Defined Function):
create function dbo.fn_hashbytesMAX
( #string nvarchar(max)
, #Algo varchar(10)
)
returns varbinary(20)
as
/************************************************************
*
* Author: Brandon Galderisi
* Last modified: 15-SEP-2009 (by Denis)
* Purpose: uses the system function hashbytes as well
* as sys.fn_varbintohexstr to split an
* nvarchar(max) string and hash in 8000 byte
* chunks hashing each 8000 byte chunk,,
* getting the 40 byte output, streaming each
* 40 byte output into a string then hashing
* that string.
*
*************************************************************/
begin
declare #concat nvarchar(max)
,#NumHash int
,#HASH varbinary(20)
set #NumHash = ceiling((datalength(#string)/2)/(4000.0))
/* HashBytes only supports 8000 bytes so split the string if it is larger */
if #NumHash>1
begin
-- # * 4000 character strings
;with a as (select 1 as n union all select 1) -- 2
,b as (select 1 as n from a ,a a1) -- 4
,c as (select 1 as n from b ,b b1) -- 16
,d as (select 1 as n from c ,c c1) -- 256
,e as (select 1 as n from d ,d d1) -- 65,536
,f as (select 1 as n from e ,e e1) -- 4,294,967,296 = 17+ TRILLION characters
,factored as (select row_number() over (order by n) rn from f)
,factors as (select rn,(rn*4000)+1 factor from factored)
select #concat = cast((
select right(sys.fn_varbintohexstr
(
hashbytes(#Algo, substring(#string, factor - 4000, 4000))
)
, 40) + ''
from Factors
where rn <= #NumHash
for xml path('')
) as nvarchar(max))
set #HASH = dbo.fn_hashbytesMAX(#concat ,#Algo)
end
else
begin
set #HASH = convert(varbinary(20), hashbytes(#Algo, #string))
end
return #HASH
end
And the results are as following:
select
hashbytes('sha1', N'test') --native function with nvarchar input
,hashbytes('sha1', 'test') --native function with varchar input
,dbo.fn_hashbytesMAX('test', 'sha1') --Galderisi's function which casts to nvarchar input
,dbo.fnGetHash('sha1', 'test') --your function
Output:
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0xA94A8FE5CCB19BA61C4C0873D391E987982FBBD3
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0x00000000AE6DBA4E0F767D06A97038B0C24ED720662ED9F1
I've taken the accepted answer, and modified it a bit with the following improvements:
no longer recursive function
now schema bound
no longer relying on undocumented stored procedures
two versions: one for nvarchar, one for varchar
returns same data size as HASHBYTES, leaving it up to the end user to convert to smaller based on algorithm used. This allows the functions to support future algorithms with larger data returns.
With these changes, the functions can now be used in persisted computed columns as they are now marked deterministic when created.
CREATE FUNCTION dbo.fnHashBytesNVARCHARMAX
(
#Algorithm VARCHAR(10),
#Text NVARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
DECLARE #NumHash INT;
DECLARE #HASH VARBINARY(8000);
SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
/* HashBytes only supports 8000 bytes so split the string if it is larger */
WHILE #NumHash > 1
BEGIN
-- # * 4000 character strings
WITH a AS
(SELECT 1 AS n UNION ALL SELECT 1), -- 2
b AS
(SELECT 1 AS n FROM a, a a1), -- 4
c AS
(SELECT 1 AS n FROM b, b b1), -- 16
d AS
(SELECT 1 AS n FROM c, c c1), -- 256
e AS
(SELECT 1 AS n FROM d, d d1), -- 65,536
f AS
(SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 = 17+ TRILLION characters
factored AS
(SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
factors AS
(SELECT rn, (rn * 4000) + 1 factor FROM factored)
SELECT #Text = CAST
(
(
SELECT CONVERT(VARCHAR(MAX), HASHBYTES(#Algorithm, SUBSTRING(#Text, factor - 4000, 4000)), 1)
FROM factors
WHERE rn <= #NumHash
FOR XML PATH('')
) AS NVARCHAR(MAX)
);
SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
END;
SET #HASH = CONVERT(VARBINARY(8000), HASHBYTES(#Algorithm, #Text));
RETURN #HASH;
END;
CREATE FUNCTION dbo.fnHashBytesVARCHARMAX
(
#Algorithm VARCHAR(10),
#Text VARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
DECLARE #NumHash INT;
DECLARE #HASH VARBINARY(8000);
SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
/* HashBytes only supports 8000 bytes so split the string if it is larger */
WHILE #NumHash > 1
BEGIN
-- # * 4000 character strings
WITH a AS
(SELECT 1 AS n UNION ALL SELECT 1), -- 2
b AS
(SELECT 1 AS n FROM a, a a1), -- 4
c AS
(SELECT 1 AS n FROM b, b b1), -- 16
d AS
(SELECT 1 AS n FROM c, c c1), -- 256
e AS
(SELECT 1 AS n FROM d, d d1), -- 65,536
f AS
(SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 = 17+ TRILLION characters
factored AS
(SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
factors AS
(SELECT rn, (rn * 8000) + 1 factor FROM factored)
SELECT #Text = CAST
(
(
SELECT CONVERT(VARCHAR(MAX), HASHBYTES(#Algorithm, SUBSTRING(#Text, factor - 8000, 8000)), 1)
FROM factors
WHERE rn <= #NumHash
FOR XML PATH('')
) AS NVARCHAR(MAX)
);
SET #NumHash = CEILING(DATALENGTH(#Text) / (8000.0));
END;
SET #HASH = CONVERT(VARBINARY(8000), HASHBYTES(#Algorithm, #Text));
RETURN #HASH;
END;
You could write a SQL CLR function:
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBinary BigHashBytes(SqlString algorithm, SqlString data)
{
var algo = HashAlgorithm.Create(algorithm.Value);
var bytes = Encoding.UTF8.GetBytes(data.Value);
return new SqlBinary(algo.ComputeHash(bytes));
}
And then it can be called in SQL like this:
--these return the same value
select HASHBYTES('md5', 'test stuff')
select dbo.BigHashBytes('md5', 'test stuff')
The BigHashBytes is only necessary if the length would be over 8k.
tested and working
select master.sys.fn_repl_hash_binary(someVarbinaryMaxValue)
moreover not complicated :)
This can be used as function body, too:
DECLARE #A NVARCHAR(MAX) = N'test'
DECLARE #res VARBINARY(MAX) = 0x
DECLARE #position INT = 1
,#len INT = DATALENGTH(#A)
WHILE 1 = 1
BEGIN
SET #res = #res + HASHBYTES('SHA2_256', SUBSTRING(#A, #position, 4000))
SET #position = #position+4000
IF #Position > #len
BREAK
END
SELECT HASHBYTES('SHA2_256',#res)
The idea si to HASH each 4000 part of the NVARCHAR(MAX) string and concatanate the results. Then to HASH the latter result.
It seems the easiest solution is to write a recursive hashing algorithm that parses the input text value into sub varchar(8000) segments.
I arbitrarily chose to slice the input string into 7500 character segments
The hashing algorithm returns a varbinary(20) which can easily be converted into a varchar(20)
ALTER FUNCTION [dbo].[BigHash]
(
#TextValue nvarchar(max)
)
RETURNS varbinary(20)
AS
BEGIN
if #TextValue = null
return hashbytes('SHA1', 'null')
Declare #FirstPart as varchar(7500)
Declare #Remainder as varchar(max)
Declare #RemainderHash as varbinary(20)
Declare #BinaryValue as varbinary(20)
Declare #TextLength as integer
Set #TextLength = len(#TextValue)
if #TextLength > 7500
Begin
Set #FirstPart = substring(#TextValue, 1, 7500)
Set #Remainder = substring(#TextValue, 7501, #TextLength - 7500)
Set #RemainderHash = dbo.BigHash(#Remainder)
Set #BinaryValue = hashbytes('SHA1', #FirstPart + convert( varchar(20), #RemainderHash, 2 ))
return #BinaryValue
End
else
Begin
Set #FirstPart = substring(#TextValue, 1, #TextLength)
Set #BinaryValue = hashbytes('SHA1', #FirstPart)
return #BinaryValue
End
return null
END