Wednesday, March 7, 2012

Convert RTF to plain text in sql

Hi All,
Can anyone tell me how to convert RTF formatted data into plain text?
I have a column in sql server 2005 table which stores the rtf data. I
want to convert and store this data as plain text in another table.
So then I can display this data in my SSRS 2005 report.
Thanks,RTF data is plain text which has some pieces of the text marked up by
XML tags. There are a couple of versions of RTF, and the type of RTF
data you have will be marked by the first XML tag you see in your RTF
document. RTF also allows authors to create custom (non-standard) RTF
tags which may appear in your RTF data.
Below appears the core code we use to extract text from RTF files. It
is written in VB.NET 1.1
'RTFreader is a customized .NET binary reader that has a method added
to it to
'parse out text from an rtf file. Uses same methods and properties as
binary
'reader does.
Public Class RTFreader
Inherits System.IO.BinaryReader
Const openCurlyBrace As System.Byte = Asc("{")
Const closedCurlyBrace As System.Byte = Asc("}")
Const openRoundBrace As System.Byte = Asc("(")
Const closedRoundBrace As System.Byte = Asc(")")
Const doubleQuote As System.Byte = 34
Const backSlash As System.Byte = Asc("\")
Const hyphen As System.Byte = Asc("-")
Const space As System.Byte = Asc(" ")
Const asterisk As System.Byte = Asc("*")
Const apostroph As System.Byte = Asc("'")
Const semicolon As System.Byte = Asc(";")
Const nullSpace As System.Byte = 0
Const tab As System.Byte = 9
Const linefeed As System.Byte = 10
Const carriageReturn As System.Byte = 13
Const forwardSlash As System.Byte = Asc("/")
Const pipe As System.Byte = Asc("|")
Const underscore As System.Byte = Asc("_")
Const tilde As System.Byte = Asc("~")
Const lessThan As System.Byte = Asc("<")
Const greaterThan As System.Byte = Asc(">")
'CONSTRUCTOR
Sub New(ByVal st As System.IO.Stream)
MyBase.New(st)
End Sub
'READTEXT
'strips tabs,carriage returns, line feeds, and non-alphabetic
characters and returns everything else
Public Function ReadText(ByVal count As System.Int32, ByRef
strError As System.String, Optional ByVal keyword As System.String ="") As System.String
Dim result As System.String = "OK"
Dim text As System.String = ""
Dim prevchar As Byte = 0
Dim l As System.Int32 = 0
Dim p As System.Int32 = 0
Dim prev As System.Byte = 0
Try
Dim rtfData() As Byte = MyBase.ReadBytes(count)
l = rtfData.Length() - 1
Dim i As System.Int32 = 0
'parse the rtfData and copy characters out of it into
result
For i = 0 To l
Select Case rtfData(i) 'CURRENT CHARACTER
Case carriageReturn
'ignore
Case linefeed
'ignore
Case nullSpace
'ignore
Case Else 'its text
If rtfData(i) <> 32 Or (rtfData(i) = 32 And
prev <> 32) Then
text = text + Chr(rtfData(i))
prev = rtfData(i)
End If
End Select
Next
Catch of As OverflowException
result = "ReadText: " + rptError(of)
Catch re As NullReferenceException
result = "ReadText: " + rptError(re)
Catch ir As IndexOutOfRangeException
result = "ReadText: " + rptError(ir)
Catch io As IO.IOException
result = "ReadText: " + rptError(io)
Catch ex As Exception
result = "ReadText: " + rptError(ex)
End Try
If result = "OK" Then
If l + 1 >= count Then
strError = "MORE"
Else
strError = result
End If
Else
strError = result
End If
text = Replace(text, "[^\w\.@.-]", " ")
Return text
End Function
Public Function IsRTF() As System.Boolean
Dim blnResult As System.Boolean = False
Dim result As System.String = ""
Dim strError As System.String = ""
result = ReadRTF(100, strError, "rtf")
If Not (strError <> "OK" And strError <> "MORE") Then
If Len(result) >= 3 And Left(result, 3) = "rtf" Then
blnResult = True
End If
End If
Return blnResult
End Function
'READRTF
'strips control words from rtf text and returns just the text
Public Function ReadRTF(ByVal count As System.Int32, ByRef
strError As System.String, Optional ByVal keyword As System.String ="") As System.String
Dim result As System.String = "OK"
Dim text As System.String = ""
Dim ctrlSymbol As System.Char = ""
Dim prevchar As Byte = 0
Dim l As System.Int32 = 0
Dim p As System.Int32 = 0
Try
Dim rtfData() As Byte = MyBase.ReadBytes(count)
l = rtfData.Length() - 1
Dim i As System.Int32 = 0
Dim controlWord As System.String = ""
Dim controlSymbol As System.Byte = 0
Dim curlyCount As System.Int32 = 0
Dim parm As System.String = ""
Dim junk As System.String = ""
'parse the rtfData and copy characters out of it into
result
For i = 0 To l
Select Case rtfData(i) 'CURRENT CHARACTER
Case openCurlyBrace 'START OF GROUP
curlyCount = curlyCount + 1
Case closedCurlyBrace 'END OF GROUP
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
Case backSlash 'THIS IS THE START OF A CONTROL
WORD or CONTROL SYMBOL
i = i + 1 'goto the next character to
determine what it is
If i <= l Then
If isLetter(rtfData(i)) Then 'its a
control word
controlWord = breakOutControlWord(i,
rtfData, l, parm)
If controlWord = keyword Then
text = controlWord + parm
Exit For
Else
Select Case controlWord
Case "fonttbl"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "colortbl"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "stylesheet"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "listtable"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "info"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "template"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "docvar"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "datafield"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "listlevel"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "pnseclvl"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "pict"
ignorePict(i, rtfData, l,
curlyCount)
Case "object"
ignoreObject(i, rtfData,
l, curlyCount)
Case "comment"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "footnote"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "footer"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "footerl"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "footerr"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "footerf"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "ftnsep"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "ftnsepc"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "ftncn"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "pn"
ignoreGroup(i, rtfData, l,
curlyCount)
text = text + " "
Case "pntext"
ignoreGroup(i, rtfData, l,
curlyCount)
text = text + " "
Case "par"
text = text + " "
Case "tab"
text = text + " "
Case "sect"
text = text + " "
Case "lquote"
text = text + " "
Case "generator"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "sn"
ignoreGroup(i, rtfData, l,
curlyCount)
Case "sv"
ignoreGroup(i, rtfData, l,
curlyCount)
End Select
End If
Else 'its a control symbol
controlSymbol =Asc(breakOutControlSymbol(i, rtfData, l))
If controlSymbol = 32 Or
isLetterOrDigit(controlSymbol) Then
text = text + Chr(controlSymbol)
End If
End If
End If
Case carriageReturn
'ignore
Case linefeed
'ignore
Case nullSpace
'ignore
Case Else 'its text
text = text + Chr(rtfData(i))
End Select
Next
Catch of As OverflowException
result = "ReadRTF: " + rptError(of)
Catch re As NullReferenceException
result = "ReadRTF: " + rptError(re)
Catch ir As IndexOutOfRangeException
result = "ReadRTF: " + rptError(ir)
Catch io As IO.IOException
result = "ReadRTF: " + rptError(io)
Catch ex As Exception
result = "ReadRTF: " + rptError(ex)
End Try
If result = "OK" Then
If l + 1 >= count Then
strError = "MORE"
Else
strError = result
End If
Else
strError = result
End If
text = Replace(text, "[^\w\.@.-]", " ")
Return text
End Function
Private Function dump(ByRef i As System.Int32, ByRef rtfData() As
Byte, ByVal l As System.Int32, ByRef w As System.Int32) As
System.String
Dim result As System.String = ""
Dim s As System.Int32 = i
Dim e As System.Int32 = i + w
Dim c As System.Int32 = 0
For c = s To e
result = result + Chr(rtfData(c))
Next
Return " >>DUMP:" + result + "<<< "
End Function
Private Sub ignoreObject(ByRef i As System.Int32, ByRef rtfData()
As Byte, ByVal l As System.Int32, ByRef curlyCount As System.Int32)
Dim startCurlyCount As System.Int32 = curlyCount
Dim controlWord As System.String = ""
Dim parm As System.String = ""
Dim blockEnd As System.Int32 = 0
i = i + 1
While i <= l And curlyCount >= startCurlyCount
Select Case rtfData(i)
Case openCurlyBrace
curlyCount = curlyCount + 1
Case closedCurlyBrace
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
Case carriageReturn
'ignore
Case linefeed
'ignore
Case space
'ignore (we shouldn't be processing text in a
object)
Case backSlash 'THIS IS THE START OF A CONTROL WORD or
CONTROL SYMBOL
i = i + 1 'goto the next character to determine
what it is
If i <= l Then
If isLetter(rtfData(i)) Then 'its a control
word
controlWord = breakOutControlWord(i,
rtfData, l, parm)
Select Case controlWord
Case "objdata"
blockEnd = Array.IndexOf(rtfData,
closedCurlyBrace, i)
If blockEnd > 0 Then
i = blockEnd
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
End If
End Select
End If
End If
End Select
i = i + 1
End While
i = i - 1 'when the last curly bracket is done, we are one
past the position of the closing bracket
End Sub
Private Sub ignorePict(ByRef i As System.Int32, ByRef rtfData() As
Byte, ByVal l As System.Int32, ByRef curlyCount As System.Int32)
Dim startCurlyCount As System.Int32 = curlyCount
Dim controlWord As System.String = ""
Dim parm As System.String = ""
Dim blockEnd As System.Int32 = 0
i = i + 1 ' move off of the /pict control word to parse what
is after it
While i <= l And curlyCount >= startCurlyCount
Select Case rtfData(i)
Case openCurlyBrace
curlyCount = curlyCount + 1
Case closedCurlyBrace
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
Case carriageReturn
'ignore
Case linefeed
'ignore
Case space
'ignore (we shouldn't be processing text in a
picture)
Case backSlash 'THIS IS THE START OF A CONTROL WORD or
CONTROL SYMBOL
i = i + 1 'goto the next character to determine
what it is
If i <= l Then
If isLetter(rtfData(i)) Then 'its a control
word
controlWord = breakOutControlWord(i,
rtfData, l, parm)
End If
End If
Case Else 'its plain text on the \pict tag
If curlyCount = startCurlyCount Then
blockEnd = Array.IndexOf(rtfData,
closedCurlyBrace, i, l - i)
If blockEnd > 0 Then
i = blockEnd
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
End If
End If
End Select
i = i + 1
End While
i = i - 1 'when the last curly bracket is done, we are one
past the position of the closing bracket
End Sub
Private Sub ignoreGroup(ByRef i As System.Int32, ByRef rtfData()
As Byte, ByVal l As System.Int32, ByRef curlyCount As System.Int32)
Dim startCurlyCount As System.Int32 = curlyCount
While i <= l And curlyCount >= startCurlyCount
Select Case rtfData(i)
Case openCurlyBrace
curlyCount = curlyCount + 1
Case closedCurlyBrace
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
End Select
i = i + 1
End While
i = i - 1 'when the last curly bracket is done, we are one
past the position of the closing bracket
'so we have to backup by 1
End Sub
Private Function breakOutGroup(ByRef i As System.Int32, ByRef
rtfData() As Byte, ByVal l As System.Int32, ByRef curlyCount As
System.Int32) As System.String
Dim result As System.String = ""
Dim startCurlyCount As System.Int32 = curlyCount
While i <= l And curlyCount >= startCurlyCount
Select Case rtfData(i)
Case openCurlyBrace
curlyCount = curlyCount + 1
Case closedCurlyBrace
curlyCount = curlyCount - 1
If curlyCount < 0 Then
curlyCount = 0
End If
End Select
result = result + Chr(rtfData(i))
i = i + 1
End While
i = i - 1 'when the last curly bracket is done, we are one
past the position of the closing bracket
'so we have to backup by 1
Return result
End Function
Private Function breakOutControlWord(ByRef i As System.Int32,
ByRef rtfData() As Byte, ByVal l As System.Int32, ByRef parm As
System.String) As System.String
Dim ctrlWord As System.String = ""
'get the control word (which is made out of only letters)
While i <= l And isLetter(rtfData(i))
ctrlWord = ctrlWord + LCase(Chr(rtfData(i)))
i = i + 1
End While
'check if there's a parameter
If isParm(rtfData(i)) Then
If rtfData(i) = hyphen Then
i = i + 1 'we want to get the value which follows the
hyphen
End If
parm = breakOutParm(i, rtfData, l)
ElseIf rtfData(i) <> space Then
i = i - 1
End If
Return Trim(ctrlWord)
End Function
Private Function breakOutControlSymbol(ByRef i As System.Int32,
ByRef rtfData() As Byte, ByVal l As System.Int32) As System.Char
Dim result As System.Char = ""
Dim strValue As System.String = ""
Dim nValue As System.Int16 = 0
Select Case rtfData(i)
Case asterisk
'code in group belongs to new RTF standard
result = ""
Case backSlash
result = Chr(rtfData(i))
Case openCurlyBrace
result = Chr(rtfData(i))
Case closedCurlyBrace
result = Chr(rtfData(i))
Case pipe
result = "F"
Case tilde
result = " "
Case hyphen
result = "-"
Case underscore
result = "-"
Case nullSpace
result = "-"
Case apostroph
result = " "
i = i + 1
strValue = breakOutParm(i, rtfData, l)
nValue = Val("&H" + strValue)
If nValue > 0 And nValue <= 255 Then
If isLetterOrDigit(nValue) = True Then
result = Chr(nValue)
End If
End If
Case lessThan
result = " "
i = i + 1
strValue = breakOutParm(i, rtfData, l)
nValue = Val(strValue)
If nValue > 0 And nValue <= 255 Then
If isLetterOrDigit(nValue) = True Then
result = Chr(nValue)
End If
End If
Case Else
result = ""
End Select
Return result
End Function
Private Function breakOutParm(ByRef i As System.Int32, ByRef
rtfData() As Byte, ByVal l As System.Int32) As System.String
Dim result As System.String = ""
Dim value As System.String = ""
While i <= l And isLetterOrDigit(rtfData(i))
value = value + Chr(rtfData(i))
i = i + 1
End While
If rtfData(i) <> space Then
i = i - 1
End If
Return value
End Function
Private Function isLetter(ByVal nLetter As System.Byte) As
System.Boolean
Dim blnResult As System.Boolean = False
Dim code As System.Int16 = 0
If nLetter >= 97 Then
code = nLetter - 32
Else
code = nLetter
End If
If code >= 65 And code <= 90 Then
blnResult = True
End If
Return blnResult
End Function
Private Function isDigit(ByVal code As System.Byte) As
System.Boolean
Dim blnResult As System.Boolean = False
If code >= 48 And code <= 57 Then
blnResult = True
End If
Return blnResult
End Function
Private Function isLetterOrDigit(ByVal code As System.Byte) As
System.Boolean
Dim blnResult As System.Boolean = False
If isDigit(code) Or isLetter(code) Then
blnResult = True
End If
Return blnResult
End Function
Private Function isParm(ByVal code As System.Byte) As
System.Boolean
Dim blnResult As System.Boolean = False
If isDigit(code) Or code = hyphen Then
blnResult = True
End If
Return blnResult
End Function
Private Function isDelimeter(ByVal code As System.Byte) As
System.Boolean
Dim blnResult As System.Boolean = False
If Not isLetterOrDigit(code) Then
blnResult = True
End If
Return blnResult
End Function
Private Function rptError(ByVal objError As System.Object) As
System.String
Dim result As System.String
Select Case objError.GetType.Name
Case "HttpException"
result = "http exception: " + vbCrLf + "HTTP Error: "
+ objError.GetHttpCode.ToString + vbCrLf +
objError.GetHtmlErrorMessage + vbCrLf + vbCrLf +
objError.ErrorCode.ToString + " " + objError.Message + vbCrLf +
objError.Source + vbCrLf + objError.StackTrace
Case "SoapException"
result = "soap exception: " + vbCrLf +
objError.Message + vbCrLf + objError.StackTrace + vbCrLf
Case "WebException"
result = "web exception: " + vbCrLf +
objError.Status.ToString() + ":" + vbCrLf + objError.Message + vbCrLf
Case "XmlException"
result = "xml exception: " + objError.Source + " on
line " + objError.LineNumber.ToString + " at position " +
objError.LinePosition.ToString + vbCrLf + objError.Message + vbCrLf +
objError.StackTrace
Case "SqlException"
result = "SQL exception: " +
objError.Number.ToString() + " " + objError.Message + vbCrLf +
objError.Server
Case "FormatException"
result = "Numeric Format exception: " +
objError.Source + vbCrLf + objError.Message + vbCrLf +
objError.StackTrace
Case "Exception"
result = "exception: " + vbCrLf + objError.Message +
vbCrLf + objError.Source + vbCrLf + objError.StackTrace
Case "String"
result = "Error: " + objError
Case "OverflowException"
result = "OverflowException: " + objError.Source +
vbCrLf + objError.Message + vbCrLf + objError.StackTrace
Case "IndexOutOfRangeException"
result = "Index exception: " + objError.Source +
vbCrLf + objError.Message + vbCrLf + objError.StackTrace
Case "NullReferenceException"
result = "Null reference exception: " +
objError.Source + vbCrLf + objError.Message + vbCrLf +
objError.StackTrace
Case "InvalidOperationException"
result = "Invalid Operation: " + objError.Source +
vbCrLf + objError.Message + vbCrLf + objError.HelpLink + vbCrLf +
objError.StackTrace
Case "ArgumentException"
result = "Bad parameter value for " +
objError.ParamName + ":" + vbCrLf + objError.Source + vbCrLf +
objError.Message + vbCrLf + objError.HelpLink + vbCrLf +
objError.StackTrace
Case "InvalidCastException"
result = "Type mis-match: " + objError.Source + vbCrLf
+ objError.Message + vbCrLf + objError.StackTrace
Case "XPathException"
result = "Bad XPath: " + objError.Source + vbCrLf +
objError.Message + vbCrLf + objError.StackTrace
Case Else
result = "An unanticipated error has occured for " +
objError.GetType.Name
End Select
Return result
End Function
End Class

No comments:

Post a Comment