Showing posts with label ScriptComponent. Show all posts
Showing posts with label ScriptComponent. Show all posts

Saturday, October 6, 2012

BIML BigData: Reading a RSS feed

Bigdata is HOT!

Based on the twitter feed reader I made here (Biml101: Reading Twitter Feed), I made a simple RSS feed reader using BIML.

The Feed URI term is passed trough the variable RSSURI. Be aware not to poll the RSS feeder every second, it probably will exclude your IP. For most feeds once every hour is enough! The package will only retrieve “new” feed items which are not jet in your table.

The Script:

<Biml xmlns="http://schemas.varigence.com/biml.xsd">
    <Annotations>
        <Annotation>
            File: RSS.Feed.Reader.biml
            Demo on reading a twitter feed
            Enviroment:
            DB: MS-SQL2012
            BIML: 1.6 VS2010 BIDS Helper
            (c) John Minkjan biml101.blogspot.com
        </Annotation>
        <Annotation>
            RSSFeedReader Table Create script:
           
            CREATE TABLE [dbo].[RSSFeedReader](
            [ChannelTitle] [nvarchar](255) NULL,
            [ChannelDescription] [nvarchar](2048) NULL,
            [ChannelLink] [nvarchar](255) NULL,
            [ChannelLanguage] [nvarchar](255) NULL,
            [ItemDescription] [nvarchar](2048) NULL,
            [ItemGuid] [nvarchar](128) NOT NULL,
            [ItemTitle] [nvarchar](255) NOT NULL,
            [ItemPubDate] [nvarchar](50) NOT NULL,
            [ItemLink] [nvarchar](255) NULL,
            [RSSURI] [nvarchar](255) NULL,
            CONSTRAINT [PK_RSSFeedReader] PRIMARY KEY CLUSTERED
            (
            [ItemGuid] ASC,
            [ItemTitle] ASC,
            [ItemPubDate] ASC
            )
            )
        </Annotation>
    </Annotations>
    <Connections>
        <OleDbConnection
    Name="CnOleDBAdventureWorks2012"
    ConnectionString="Data Source=.\MSSQL2012;Initial Catalog=AdventureWorks2012;Provider=SQLNCLI10.1;Integrated Security=SSPI;Connect Timeout=30;"
    RetainSameConnection="true">
        </OleDbConnection>
    </Connections>
   
    <ScriptProjects>
        <ScriptComponentProject Name="SC_RSS_READER">
            <AssemblyReferences>
                <AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSPipelineWrap" />
                <AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSRuntimeWrap" />
                <AssemblyReference AssemblyPath="Microsoft.SqlServer.PipelineHost" />
                <AssemblyReference AssemblyPath="Microsoft.SqlServer.TxScript" />
                <AssemblyReference AssemblyPath="System.dll" />
                <AssemblyReference AssemblyPath="System.AddIn.dll" />
                <AssemblyReference AssemblyPath="System.Data.dll" />
                <AssemblyReference AssemblyPath="System.Xml.dll" />
            </AssemblyReferences>
            <OutputBuffers>
                <OutputBuffer Name="Output0" IsSynchronous ="false">
                    <Annotations>
                        <Annotation>
                            IsSynchronous="false" ==>
                            When there are no non-synchronus buffers,
                            the compiler does not emit the CreateNewOutputRows
                            virtual base method (to match the BIDS/SSDT behavior)
                        </Annotation>
                    </Annotations>
                    <Columns>
                        <Column Name="ChannelTitle" DataType="String" Length="255"></Column>
                        <Column Name="ChannelDescription" DataType="String" Length="2048"></Column>
                        <Column Name="ChannelLink" DataType="String" Length="255"></Column>
                        <Column Name="ChannelLanguage" DataType="String" Length="255"></Column>
                        <Column Name="ItemDescription" DataType="String" Length="2048"></Column>
                        <Column Name="ItemGuid" DataType="String" Length="128"></Column>
                        <Column Name="ItemTitle" DataType="String" Length="255"></Column>
                        <Column Name="ItemPubDate" DataType="String" Length="50"></Column>
                        <Column Name="ItemLink" DataType="String" Length="255"></Column>
                        <Column Name="RSSURI" DataType="String" Length="255"></Column>
                    </Columns>                   
                </OutputBuffer>                                
            </OutputBuffers>
            <ReadOnlyVariables>               
                <Variable Namespace="User" VariableName="RSSURI" DataType="String"></Variable>
            </ReadOnlyVariables>
            <Files>
                <File Path="AssemblyInfo.cs">
                    using System.Reflection;
                    using System.Runtime.CompilerServices;
                    [assembly: AssemblyTitle("SC_RSS_READER")]
                    [assembly: AssemblyDescription("")]
                    [assembly: AssemblyConfiguration("")]
                    [assembly: AssemblyCompany("")]
                    [assembly: AssemblyProduct("SC_RSS_READER")]
                    [assembly: AssemblyCopyright("Copyright @  2012")]
                    [assembly: AssemblyTrademark("")]
                    [assembly: AssemblyCulture("")]
                    [assembly: AssemblyVersion("1.0.*")]
                </File>
                <File Path="main.cs">
                    <![CDATA[
                    using System;
                    using System.Data;
                    using Microsoft.SqlServer.Dts.Pipeline.Wrapper;
                    using Microsoft.SqlServer.Dts.Runtime.Wrapper;
                    using System.Xml;

                    [Microsoft.SqlServer.Dts.Pipeline.SSISScriptComponentEntryPointAttribute]
                    public class ScriptMain : UserComponent
                    {

                    public override void CreateNewOutputRows()
                    {
                    XmlTextReader rssReader;
                    XmlDocument rssDoc;
                    XmlNode nodeRss;
                    XmlNode nodeChannel;
                    XmlNode nodeItem;

                    string RSSURI = this.Variables.RSSURI.ToString();
                    rssReader = new XmlTextReader(RSSURI);
                    rssDoc = new XmlDocument();

                    rssDoc.Load(rssReader);

                    for (int i = 0; i < rssDoc.ChildNodes.Count; i++)
                    {

                    if (rssDoc.ChildNodes[i].Name == "rss")
                    {
                    nodeRss = rssDoc.ChildNodes[i];          

                    for (int j = 0; j < nodeRss.ChildNodes.Count; j++)
                    {
                    if (nodeRss.ChildNodes[j].Name == "channel")
                    {
                    nodeChannel = nodeRss.ChildNodes[j];
                                              
                    for (int k = 0; k < nodeChannel.ChildNodes.Count; k++)
                    {

                    if (nodeChannel.ChildNodes[k].Name == "item")
                    {
                    nodeItem = nodeChannel.ChildNodes[k];
                    Output0Buffer.AddRow();
                    Output0Buffer.ChannelTitle = nodeChannel["title"].InnerText;
                    Output0Buffer.ChannelDescription = nodeChannel["description"].InnerText;
                    Output0Buffer.ChannelLink = nodeChannel["link"].InnerText;
                    Output0Buffer.ChannelLanguage = nodeChannel["language"].InnerText;
                    Output0Buffer.ItemDescription = nodeItem["description"].InnerText;
                    Output0Buffer.ItemGuid = nodeItem["guid"].InnerText;
                    Output0Buffer.ItemTitle = nodeItem["title"].InnerText;
                    Output0Buffer.ItemPubDate = nodeItem["pubDate"].InnerText;
                    Output0Buffer.ItemLink = nodeItem["link"].InnerText;
                    Output0Buffer.RSSURI = RSSURI;                              
                    }
                    }
                    }
                    }
                    }
                    }
                    }

                    }
                    ]]>
                </File>
            </Files>
        </ScriptComponentProject>
    </ScriptProjects>
    <Packages>
        <Package Name ="RSS.Feed.Reader" ConstraintMode="Linear">
            <Variables>
                <Variable Name="RSSURI" DataType="String">http://rss.cnn.com/rss/edition.rss</Variable>               
            </Variables>
            <Tasks>
                <Dataflow Name ="DFT Read RSS Feed" >
                    <Transformations>
                        <ScriptComponentSource Name="SC RSS Feed Reader">
                            <ScriptComponentProjectReference ScriptComponentProjectName="SC_RSS_READER"></ScriptComponentProjectReference>
                        </ScriptComponentSource>
                        <Lookup Name="LKP Existing ItemGuids"
                                NoMatchBehavior="RedirectRowsToNoMatchOutput"
                                OleDbConnectionName="CnOleDBAdventureWorks2012" 
                                CacheMode="Partial">
                            <ExternalTableInput Table="[dbo].[RSSFeedReader]"></ExternalTableInput>
                            <Inputs>
                                <Column SourceColumn="ItemGuid" TargetColumn="ItemGuid"></Column>
                                <Column SourceColumn="ItemPubDate" TargetColumn="ItemPubDate"></Column>
                                <Column SourceColumn="ItemTitle" TargetColumn="ItemTitle"></Column>
                            </Inputs>
                        </Lookup>                   
                        <OleDbDestination Name="ODD dbo RSSFeedReader" ConnectionName="CnOleDBAdventureWorks2012">
                            <InputPath OutputPathName="LKP Existing ItemGuids.NoMatch"></InputPath>
                            <ExternalTableOutput Table="[dbo].[RSSFeedReader]"></ExternalTableOutput>
                        </OleDbDestination>
                    </Transformations>                   
                </Dataflow>
            </Tasks>
        </Package>
    </Packages>
</Biml>

The Script can also be found here: RSS.Feed.Reader.biml

Will give you:

image

Till Next Time (also published on BIMLSCRIPT.com ==> http://bimlscript.com/Snippet/Details/59)

Monday, September 17, 2012

BIML MD5 ScriptComponent Hash

Demo biml using the ScriptComponent to create an MD5 hash, very handy if you are not allowed to use open source altenatives like ==> http://ssismhash.codeplex.com/
<#@ template language="C#" hostspecific="true"#>
<#@ import namespace="System.Data"#>
<#@ import namespace="System.Data.SqlClient"#>
<#@ import namespace="System.IO"#>
<Biml xmlns="http://schemas.varigence.com/biml.xsd">
<Annotations>
        <Annotation>
            File: MD5.ScriptComponent.biml
            Demo biml using the ScriptComponent to
            create an MD5 hash.
            Enviroment:
            DB: MS-SQL2012
            BIML: 1.6 VS2010 BIDS Helper
            (c) John Minkjan biml101.blogspot.com
        </Annotation>
</Annotations>
<Connections>
<OleDbConnection
    Name="CnOleDBAdventureWorks2012"
    ConnectionString="Data Source=.\MSSQL2012;Initial Catalog=AdventureWorks2012;Provider=SQLNCLI10.1;Integrated Security=SSPI;Connect Timeout=30;"/>
</Connections>
   
<ScriptProjects>                       
    <ScriptComponentProject ProjectCoreName="sc_c253bef215bf4d6b85dbe3919c35c167.csproj" Name="SC Add MD5">
        <AssemblyReferences>
            <AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSPipelineWrap" />
            <AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSRuntimeWrap" />
            <AssemblyReference AssemblyPath="Microsoft.SqlServer.PipelineHost" />
            <AssemblyReference AssemblyPath="Microsoft.SqlServer.TxScript" />
            <AssemblyReference AssemblyPath="System.dll" />
            <AssemblyReference AssemblyPath="System.AddIn.dll" />
            <AssemblyReference AssemblyPath="System.Data.dll" />
            <AssemblyReference AssemblyPath="System.Xml.dll" />
        </AssemblyReferences>
        <Files>
            <File Path="AssemblyInfo.cs">
            using System.Reflection;
            using System.Runtime.CompilerServices;
            [assembly: AssemblyTitle("SC_c253bef215bf4d6b85dbe3919c35c167.csproj")]
            [assembly: AssemblyDescription("")]
            [assembly: AssemblyConfiguration("")]
            [assembly: AssemblyCompany("Ciber Nederland")]
            [assembly: AssemblyProduct("SC_c253bef215bf4d6b85dbe3919c35c167.csproj")]
            [assembly: AssemblyCopyright("Copyright @ Ciber Nederland 2012")]
            [assembly: AssemblyTrademark("")]
            [assembly: AssemblyCulture("")]
            [assembly: AssemblyVersion("1.0.*")]
            </File>
            <File Path="ScriptMain.cs">
            using System;
            using System.Data;
            using Microsoft.SqlServer.Dts.Pipeline.Wrapper;
            using Microsoft.SqlServer.Dts.Runtime.Wrapper;
            using System.Security.Cryptography;
            using System.Text;
            using System.IO;
            [Microsoft.SqlServer.Dts.Pipeline.SSISScriptComponentEntryPointAttribute]
            public class ScriptMain : UserComponent
            {
            static public string GetMd5Sum(string str)
            {
            Encoder enc = System.Text.Encoding.Unicode.GetEncoder();
            byte[] unicodeText = new byte[str.Length * 2];
            enc.GetBytes(str.ToCharArray(), 0, str.Length, unicodeText, 0, true);
            MD5 md5 = new MD5CryptoServiceProvider();
            byte[] result = md5.ComputeHash(unicodeText);
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i != result.Length; i++)
            {
            sb.Append(result[i].ToString("X2"));
            }
            return sb.ToString();
            }
            public override void PreExecute()
            {
            base.PreExecute();
            }
            public override void PostExecute()
            {
            base.PostExecute();
            }
            public override void Input0_ProcessInputRow(Input0Buffer Row)
            {
            Row.MD5AllColumns = GetMd5Sum(Row.FirstName + Row.LastName);
            }
            }
            </File>
        </Files>
<InputBuffer Name="Input0">
    <Columns>
        <Column Name="MD5AllColumns" DataType="AnsiString" Length="50" UsageType="ReadWrite"></Column>
        <Column Name="FirstName" DataType="AnsiString" Length="50" UsageType="ReadWrite"></Column>
        <Column Name="LastName" DataType="AnsiString" Length="50" UsageType="ReadWrite"></Column>
    </Columns>
</InputBuffer>
    <OutputBuffers>
        <OutputBuffer Name="Output0">
    </OutputBuffer>
</OutputBuffers>
</ScriptComponentProject>
</ScriptProjects>
    <Packages>
        <Package  ConstraintMode="Parallel" Name="MD5.ScriptComponent">
        <Variables>
                <Variable Name ="RowCount"
                          DataType="Int32"
                          EvaluateAsExpression="false">0</Variable>
                <!--Stock symbol has to be enclosed in double quotes-->
            </Variables>
            <Tasks>
                <Container ConstraintMode ="Parallel" Name="SEQ Main">
                    <Tasks>
                        <Dataflow Name="DFT Main">
                            <Transformations>
                                <OleDbSource Name="ODS Adventureworks Person" ConnectionName="CnOleDBAdventureWorks2012">
                                    <ExternalTableInput Table = "[Person].[Person]"></ExternalTableInput>
                                </OleDbSource>
                                <DerivedColumns Name="DC MD5">
                                <InputPath OutputPathName="ODS Adventureworks Person.Output"></InputPath>
                                <Columns>
                                    <Column Name="MD5AllColumns" DataType="AnsiString" Length="50">"--"</Column>                                   
                                </Columns>
                            </DerivedColumns>
                                <ScriptComponentTransformation Name="SC Add MD5">
                                <InputPath OutputPathName="DC MD5.Output"></InputPath>
                                <ScriptComponentProjectReference ScriptComponentProjectName="SC Add MD5" />                               
                            </ScriptComponentTransformation>
                            <!-- Dummy Rowcount to show results in dataviewer -->
                            <RowCount VariableName="User.RowCount" Name="RC Input">
                                <InputPath OutputPathName="SC Add MD5.Output0"></InputPath>
                            </RowCount>
                            </Transformations>
                        </Dataflow>
                    </Tasks>
                </Container>
            </Tasks>
        </Package>
    </Packages>
</Biml>

Will get you:
image
Till Next Time