Hive – testing queries with dummy data

By | August 8, 2017
If your query looks like “SELECT * FROM TABLE1;”
You want to test the input from “TABLE1” with your dummy dataset.
If you have a multiple subqueries using a base table. This comes very handy.


-- Creating single dummy row:
SELECT *
FROM
(
     -- This is our dummy row, which is a replacement of 'SELECT * FROM TABLE1;'
     SELECT
         "row11" AS col1
        ,"row12" AS col2
        ,"row13" AS col3
) t1 
;

-- Output:
+--------+--------+--------+--+
| col1 | col2 | col3 |
+--------+--------+--------+--+
| row11 | row12 | row13 |
+--------+--------+--------+--+
1 row selected (0.154 seconds)

-- Creating multiple dummy rows:
SELECT rownumber
     , col1
     , col2
     , col3
     , col4
FROM
(
     -- These are 2 dummy rows
     SELECT
     ARRAY(
            NAMED_STRUCT("rownumber" , "row1", "col1" ,"row11", "col2" ,"row12" ,"col3" ,"row13" , "col4" , ARRAY("value11", "value12", "value13"))
           ,NAMED_STRUCT("rownumber" , "row2","col1" ,"row21", "col2" ,"row12" ,"col3" ,"row23", "col4" , ARRAY("value21", "value22", "value23"))
      ) AS rows
) t1 
LATERAL VIEW OUTER INLINE(rows) rows_table AS rownumber, col1, col2, col3, col4
;

-- Output:
+------------+--------+--------+--------+----------------------------------+--+
| rownumber | col1 | col2 | col3 | col4 |
+------------+--------+--------+--------+----------------------------------+--+
| row1 | row11 | row12 | row13 | ["value11","value12","value13"] |
| row2 | row21 | row12 | row23 | ["value21","value22","value23"] |
+------------+--------+--------+--------+----------------------------------+--+
2 rows selected (45.364 seconds)

-- More complex example
SELECT
    rownumber
    , col1
    , col2
    , col3
    , col4_exploded
    , UNIX_TIMESTAMP(col5,"yyyy-MM-dd")
FROM
(
     SELECT rownumber
        , col1
        , col2
       , col3
       , col4
      , col5
    FROM
   (
       -- These are 3 dummy rows
       SELECT
           ARRAY(
              NAMED_STRUCT("rownumber" , "row1", "col1" ,"row11", "col2" ,"row12" ,"col3" ,"row13" , "col4" , ARRAY("value11", "value12", "value13") , "col5" , "2010-01-01 10:40:30")
             ,NAMED_STRUCT("rownumber" , "row2","col1" ,"row21", "col2" ,"row22" ,"col3" ,"row23", "col4" , ARRAY("value21", "value22", "value23"), "col5" , "2010-04-01 09:20:30")
            ,NAMED_STRUCT("rownumber" , "row3","col1" ,"row31", "col2" ,"row32" ,"col3" ,"row33", "col4" , ARRAY("value31", "value32", "value33"), "col5" , "2010-03-01 07:20:30")
          ) AS rows
     ) t1 
     LATERAL VIEW OUTER INLINE(rows) rows_table AS rownumber, col1, col2, col3, col4, col5
) lhs 
LATERAL VIEW OUTER EXPLODE(col4) col4_table AS col4_exploded
;
-- Output:
+------------+--------+--------+--------+----------------+-------------+--+
| rownumber | col1 | col2 | col3 | col4_exploded | _c5 |
+------------+--------+--------+--------+----------------+-------------+--+
| row1 | row11 | row12 | row13 | value11 | 1262304000 |
| row1 | row11 | row12 | row13 | value12 | 1262304000 |
| row1 | row11 | row12 | row13 | value13 | 1262304000 |
| row2 | row21 | row22 | row23 | value21 | 1270080000 |
| row2 | row21 | row22 | row23 | value22 | 1270080000 |
| row2 | row21 | row22 | row23 | value23 | 1270080000 |
| row3 | row31 | row32 | row33 | value31 | 1267401600 |
| row3 | row31 | row32 | row33 | value32 | 1267401600 |
| row3 | row31 | row32 | row33 | value33 | 1267401600 |
+------------+--------+--------+--------+----------------+-------------+--+
9 rows selected (25.001 seconds)
 
-- Parameterize your dummy input query
SET hivevar:DUMMY_DATASET=SELECT
ARRAY(
NAMED_STRUCT("rownumber" , "row1", "col1" ,"row11", "col2" ,"row12" ,"col3" ,"row13" , "col4" , ARRAY("value11", "value12", "value13") , "col5" , "2010-01-01 10:40:30")
,NAMED_STRUCT("rownumber" , "row2","col1" ,"row21", "col2" ,"row22" ,"col3" ,"row23", "col4" , ARRAY("value21", "value22", "value23"), "col5" , "2010-04-01 09:20:30")
,NAMED_STRUCT("rownumber" , "row3","col1" ,"row31", "col2" ,"row32" ,"col3" ,"row33", "col4" , ARRAY("value31", "value32", "value33"), "col5" , "2010-03-01 07:20:30")
) AS rows;


-- Using the dummy dataset from parameter '${DUMMY_DATASET}'
SELECT
rownumber
, col1
, col2
, col3
, col4_exploded
, UNIX_TIMESTAMP(col5,"yyyy-MM-dd")
FROM
(
SELECT rownumber
, col1
, col2
, col3
, col4
, col5
FROM
(
     ${DUMMY_DATASET}
) t1
LATERAL VIEW OUTER INLINE(rows) rows_table AS rownumber, col1, col2, col3, col4, col5
) lhs
LATERAL VIEW OUTER EXPLODE(col4) col4_table AS col4_exploded
;

-- Output:
+------------+--------+--------+--------+----------------+-------------+--+
| rownumber | col1 | col2 | col3 | col4_exploded | _c5 |
+------------+--------+--------+--------+----------------+-------------+--+
| row1 | row11 | row12 | row13 | value11 | 1262304000 |
| row1 | row11 | row12 | row13 | value12 | 1262304000 |
| row1 | row11 | row12 | row13 | value13 | 1262304000 |
| row2 | row21 | row22 | row23 | value21 | 1270080000 |
| row2 | row21 | row22 | row23 | value22 | 1270080000 |
| row2 | row21 | row22 | row23 | value23 | 1270080000 |
| row3 | row31 | row32 | row33 | value31 | 1267401600 |
| row3 | row31 | row32 | row33 | value32 | 1267401600 |
| row3 | row31 | row32 | row33 | value33 | 1267401600 |
+------------+--------+--------+--------+----------------+-------------+--+
9 rows selected (47.058 seconds)

Leave a Reply

Your email address will not be published. Required fields are marked *