a little nugget to clean up a raw data import.

I may be reinventing the wheel here, but it was late and was having a hard time googling for some reason. Needed something that I could hand off to someone to go crazy with in-loading data from raw text files, so I whipped this up.



/* 
Typically you would right click a db > Task > Import Data.
After data is imported you can use this script to create a cleaner base table.
Biggest headache is typically trying to identify the proper data type and lengths
This script attempts to handle the most common denominator, INT, DateTime, VarChar, and Decimal.

@LastColumn quirk, if importing CSV and last column contains "," the data is bad, so make best
effort to load and then go to source and work it out.

End result is Print statement that will create table x(base table) and
an insert into statement

*/
SET NOCOUNT ON
DECLARE
@ROWSAFFECTED INT
, @Count INT
, @Counter INT
, @ColumName VARCHAR(128)
, @LastColumn VARCHAR(128)
, @TableName VARCHAR(128)
, @dSQL VARCHAR(4000)
, @SaidTABLE VARCHAR(128)

SELECT
@SaidTABLE = 'CampaignResults_raw'
,@LastColumn = 'DateCreated'

IF OBJECT_ID('tempdb..#tCol') IS NOT NULL
DROP TABLE #tCol

SELECT
ORDINAL_POSITION IDX
, TABLE_NAME
, COLUMN_NAME
, CASE WHEN RIGHT(COLUMN_NAME, 2) = 'id' THEN 'INT'
WHEN COLUMN_NAME LIKE '%date%'
OR COLUMN_NAME LIKE '%time%' THEN 'DateTime'
ELSE 'VARCHAR'
END DT
, 0 MaxLen
INTO
#tCOL
FROM
INFORMATION_SCHEMA.Columns
WHERE
TABLE_NAME = @SaidTABLE
ORDER BY ORDINAL_POSITION

SELECT
@ROWSAFFECTED = @@ROWCOUNT
, @Counter = 1

IF @ROWSAFFECTED = 0
RETURN

SELECT
@Count = @ROWSAFFECTED
WHILE @Counter <= @Count
BEGIN
SELECT
@ColumName = COLUMN_NAME
, @TableName = TABLE_NAME
FROM
#tCOL
WHERE
IDX = @Counter

SELECT
@dSQL = 'UPDATE t
SET DT = CASE WHEN xINT = 0 THEN ''INT''
WHEN xDecimal =0 THEN ''Decimal''
WHEN xDate <0 THEN ''DATE''
ELSE ''VARCHAR'' END
,MaxLen = c.MaxLen
FROM
#tCOL t
JOIN
(
SELECT
' + LTRIM(STR(@counter)) + ' IDX
,SUM(CASE WHEN ISNUMERIC(LTRIM(RTRIM( [' + @ColumName
+ '] + ''e0'' ))) = 1 AND [' + @ColumName + '] NOT LIKE ''%.%'' THEN 0 ELSE 1 END) xINT
,SUM(CASE WHEN ISNUMERIC(LTRIM(RTRIM( [' + @ColumName
+ '] + ''e0'' ))) = 1 AND [' + @ColumName + '] LIKE ''%.%'' THEN 0 ELSE 1 END) xDecimal
,SUM(CASE WHEN ISDATE(LTRIM(RTRIM([' + @ColumName
+ ']))) = 1 THEN -1 ELSE 1 END) xDate
,MAX(LEN(RTRIM(LTRIM([' + @ColumName + '])))) MaxLen
FROM
' + @TableName + '
WHERE
[' + @LastColumn + '] NOT LIKE ''%,%''
AND [' + @ColumName
+ '] IS NOT NULL) c ON c.IDX = t.IDX AND t.IDX = '
+ LTRIM(STR(@counter))
--PRINT @dSQL
EXEC (@dSQL)
SELECT
@Counter = @Counter + 1
END

DECLARE @tSTR VARCHAR(8000)

SELECT
@tSTR = REPLACE('CREATE TABLE x('
+ (
SELECT
'
' + CASE WHEN IDX = 1 THEN ''
ELSE ','
END + '[' + COLUMN_NAME + ']' + CASE WHEN DT = 'INT' THEN ' [INT]'
WHEN DT = 'Decimal' THEN ' [Decimal]'
WHEN DT = 'DATE' THEN ' [DateTime]'
ELSE ' VarChar(' + LTRIM(STR(MaxLen))
+ ')'
END
FROM
#tCOL
ORDER BY IDX
FOR
XML PATH('')
), '&#x0D;', '') + ')'
PRINT @tSTR

--DECLARE @tSTR VARCHAR(8000)

SELECT
@tSTR = REPLACE('INSERT INTO X SELECT '
+ (
SELECT
'
' + CASE WHEN IDX = 1 THEN ''
ELSE ','
END + 'NULLIF(' + CASE WHEN DT IN ('INT','Decimal','Date') THEN '[' + COLUMN_NAME + ']'
ELSE 'LTRIM(RTRIM([' + COLUMN_NAME + ']))'
END + ' ,''NULL'')'
FROM
#tCOL
--where DT = 'INT'
ORDER BY IDX
FOR
XML PATH('')
), '&#x0D;', '') + ' FROM ' + @SaidTable + '
WHERE [' + @LastColumn + '] NOT LIKE ''%,%'''

PRINT @tSTR

~Enjoy

Comments :

0 comments to “a little nugget to clean up a raw data import.”