Stripping HTML from text in SQL Server–Version 3

This creates a function then call it in the sql design view like this DeHtmlize (Field Name)

I’ve used the HTML stripping function for SQL Server available in lazycoders.blogspot.com, which is the second version of the originally published in blog.sqlauthority.com. But neither one removes the comments in this case:

<!-- <b>hello world</b> --> Hello

which is more or less the code that MS Word generates.

Well, the function with that fixed is this (changes are in bold):

This one seems better.  It replaces <br> with CHAR(10)CHAR(13).

ALTER FUNCTION [dbo].[DeHtmlize]
(
@HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
DECLARE @Start  int
DECLARE @End    int
DECLARE @Length int

-- Replace the HTML entity &amp; with the '&' character (this needs to be done first, as
-- '&' might be double encoded as '&amp;amp;')
SET @Start = CHARINDEX('&amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '&')
SET @Start = CHARINDEX('&amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END

-- Replace the HTML entity &lt; with the '<' character
SET @Start = CHARINDEX('&lt;', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '<')
SET @Start = CHARINDEX('&lt;', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END

-- Replace the HTML entity &gt; with the '>' character
SET @Start = CHARINDEX('&gt;', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '>')
SET @Start = CHARINDEX('&gt;', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END

-- Replace the HTML entity &amp; with the '&' character
SET @Start = CHARINDEX('&amp;amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '&')
SET @Start = CHARINDEX('&amp;amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END

-- Replace the HTML entity &nbsp; with the ' ' character
SET @Start = CHARINDEX('&nbsp;', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ' ')
SET @Start = CHARINDEX('&nbsp;', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
END

-- Replace any <br> tags with a newline
SET @Start = CHARINDEX('<br>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, CHAR(13) + CHAR(10))
SET @Start = CHARINDEX('<br>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END

-- Replace any <br/> tags with a newline
SET @Start = CHARINDEX('<br/>', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, 'CHAR(13) + CHAR(10)')
SET @Start = CHARINDEX('<br/>', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END

-- Replace any <br /> tags with a newline
SET @Start = CHARINDEX('<br />', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, 'CHAR(13) + CHAR(10)')
SET @Start = CHARINDEX('<br />', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
END

-- Remove anything between comments <!-- -->
SET @Start = CHARINDEX('<!--', @HTMLText)
SET @End = CHARINDEX('-->', @HTMLText, CHARINDEX('<!--', @HTMLText))
SET @Length = (@End - @Start) + 3

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<!--', @HTMLText)
SET @End = CHARINDEX('-->', @HTMLText, CHARINDEX('<!--', @HTMLText))
SET @Length = (@End - @Start) + 3
END

-- Remove anything between <whatever> tags
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
END

RETURN LTRIM(RTRIM(@HTMLText))

END

GO

Hope this helps. Enjoy!

By Diego Jancic en 1:27 PM

Second Function

ALTER FUNCTION [dbo].[udf_StripHTML]

(

@HTMLText varchar(MAX)

)

RETURNS varchar(MAX)

AS

BEGIN

DECLARE @Start  int

DECLARE @End    int

DECLARE @Length int

— Replace the HTML entity &amp; with the ‘&’ character (this needs to be done first, as

— ‘&’ might be double encoded as ‘&amp;amp;’)

SET @Start = CHARINDEX(‘&amp;’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘&’)

SET @Start = CHARINDEX(‘&amp;’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

END

— Replace the HTML entity &lt; with the ‘<‘ character

SET @Start = CHARINDEX(‘&lt;’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘<‘)

SET @Start = CHARINDEX(‘&lt;’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

END

— Replace the HTML entity &gt; with the ‘>’ character

SET @Start = CHARINDEX(‘&gt;’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘>’)

SET @Start = CHARINDEX(‘&gt;’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

END

— Replace the HTML entity &amp; with the ‘&’ character

SET @Start = CHARINDEX(‘&amp;amp;’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘&’)

SET @Start = CHARINDEX(‘&amp;amp;’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

END

— Replace the HTML entity &nbsp; with the ‘ ‘ character

SET @Start = CHARINDEX(‘&nbsp;’, @HTMLText)

SET @End = @Start + 5

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘ ‘)

SET @Start = CHARINDEX(‘&nbsp;’, @HTMLText)

SET @End = @Start + 5

SET @Length = (@End – @Start) + 1

END

— Replace any <br> tags with a newline

SET @Start = CHARINDEX(‘<br>’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, CHAR(13) + CHAR(10))

SET @Start = CHARINDEX(‘<br>’, @HTMLText)

SET @End = @Start + 3

SET @Length = (@End – @Start) + 1

END

— Replace any <br/> tags with a newline

SET @Start = CHARINDEX(‘<br/>’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘CHAR(13) + CHAR(10)’)

SET @Start = CHARINDEX(‘<br/>’, @HTMLText)

SET @End = @Start + 4

SET @Length = (@End – @Start) + 1

END

— Replace any <br /> tags with a newline

SET @Start = CHARINDEX(‘<br />’, @HTMLText)

SET @End = @Start + 5

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ‘CHAR(13) + CHAR(10)’)

SET @Start = CHARINDEX(‘<br />’, @HTMLText)

SET @End = @Start + 5

SET @Length = (@End – @Start) + 1

END

— Remove anything between <whatever> tags

SET @Start = CHARINDEX(‘<‘, @HTMLText)

SET @End = CHARINDEX(‘>’, @HTMLText, CHARINDEX(‘<‘, @HTMLText))

SET @Length = (@End – @Start) + 1

WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN

SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ”)

SET @Start = CHARINDEX(‘<‘, @HTMLText)

SET @End = CHARINDEX(‘>’, @HTMLText, CHARINDEX(‘<‘, @HTMLText))

SET @Length = (@End – @Start) + 1

END

RETURN LTRIM(RTRIM(@HTMLText))

END

Advertisements