Introduction
This post will demonstrate data ingestion and performing aggregate summaries using SQL & R. Specifically, we will connect to a remote SQL server using ODBC
package. This package is currently not available in CRAN but can be downloaded from the here.
Then using the very powerful data.table
package in R, we will do some relevant summaries.
ODBC Demo
The below code will load the DBI
library. This library is installed when ODBC
package is installed.
#devtools::install_github("rstats-db/odbc")
library(DBI)
#connectionString <- "Driver=SQL Server;Server={Servern name here};Database={Database name here};Uid={User Id here};Pwd={Password here};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
#Set up a channel to the SQL server Database
con <- dbConnect(odbc::odbc(),
.connection_string = connectionString)
We can check the connection by checking the tables unders a specific schema.
dbListTables(con, schema = "bi" )
## [1] "date" "geo" "manufacturer" "product"
## [5] "salesFact" "sentiment"
R Notebooks also allow to run SQL chunks directly.
SELECT COUNT(*) FROM bi.geo
99618 |
Just another example!
SELECT TOP 10 sf.ProductID, SUM(sf.Units) as Total, p.Product, p.Category, p.Segment
FROM bi.salesFact as sf
INNER JOIN bi.product as p
ON p.ProductID = sf.ProductID
GROUP BY sf.ProductID, p.Product, p.Category, p.Segment
ProductID | Total | Product | Category | Segment |
---|---|---|---|---|
1 | 488 | Abbas MA-01 | Mix | All Season |
3 | 8515 | Abbas MA-03 | Mix | All Season |
4 | 1912 | Abbas MA-04 | Mix | All Season |
5 | 18 | Abbas MA-05 | Mix | All Season |
7 | 362 | Abbas MA-07 | Mix | All Season |
8 | 6459 | Abbas MA-08 | Mix | All Season |
9 | 5452 | Abbas MA-09 | Mix | All Season |
10 | 161 | Abbas MA-10 | Mix | All Season |
11 | 17 | Abbas MA-11 | Mix | All Season |
12 | 272 | Abbas MA-12 | Mix | All Season |
Creating data frame using odbc connection
qry <- "SELECT sf.ProductID, SUM(sf.Units) as Total, p.Product, p.Category, p.Segment
FROM bi.salesFact as sf
INNER JOIN bi.product as p
ON p.ProductID = sf.ProductID
GROUP BY sf.ProductID, p.Product, p.Category, p.Segment"
smry <- dbGetQuery(con, qry)
head(smry)
## ProductID Total Product Category Segment
## 1 1 488 Abbas MA-01 Mix All Season
## 2 3 8515 Abbas MA-03 Mix All Season
## 3 4 1912 Abbas MA-04 Mix All Season
## 4 5 18 Abbas MA-05 Mix All Season
## 5 7 362 Abbas MA-07 Mix All Season
## 6 8 6459 Abbas MA-08 Mix All Season
#class(smry)
Below query will display the top selling product for each category
library(sqldf)
qry0 <- "SELECT Category, Product, Total FROM smry AS X
WHERE Total >= ALL(SELECT Total FROM smry AS Y WHERE X.Category = Y.Category)"
library(sqldf)
# ALL Keyword not supported by sqldf
#sqldf(qry0)
Same as above
qry1 <- "SELECT Category, Product, Total FROM smry AS X
WHERE Total >= (SELECT MAX(Total) FROM smry AS Y WHERE X.Category = Y.Category)"
sqldf(qry1)
## Category Product Total
## 1 Urban Maximus UC-69 120822
## 2 Rural Natura RP-79 113849
## 3 Rural Natura RP-80 113849
## 4 Mix Pirum MA-11 53978
## 5 Youth Aliqui YY-05 52684
This query will only show category and product total for max product
# This will not show the max product name or category
qry1.5 <- "SELECT Category, max(Total)
FROM smry
GROUP BY Category"
sqldf(qry1.5)
## Category max(Total)
## 1 Mix 53978
## 2 Rural 113849
## 3 Urban 120822
## 4 Youth 52684
Select Top N products from each category
# Top keywork not recognized in sqldf package
qry3 <- "SELECT Category, Product, Total FROM smry AS X
WHERE Total IN (SELECT TOP 1 Total FROM smry AS Y
WHERE X.Category = Y.Category
ORDER BY Total)"
#sqldf(qry3)
Data Table Demo
Loading the Package and Data Table
library(data.table)
smry <- data.table(smry)
Category wise aggregation of total Units
smry[, .(GrandTotal = sum(Total)), by = Category][order(-GrandTotal)]
## Category GrandTotal
## 1: Urban 5961949
## 2: Rural 3968287
## 3: Youth 522154
## 4: Mix 367471
Products by Category
How many products are present is each Category
smry[, .(Products=.N), by = Category][order(-Products)]
## Category Products
## 1: Urban 1153
## 2: Rural 625
## 3: Mix 157
## 4: Youth 136
Products by Category and Segment
How many products are present in each Category
smry[, .(Products=.N), by = .(Category, Segment)][order(Category, -Products)]
## Category Segment Products
## 1: Mix All Season 125
## 2: Mix Productivity 32
## 3: Rural Productivity 502
## 4: Rural Select 123
## 5: Urban Convenience 382
## 6: Urban Extreme 317
## 7: Urban Moderation 265
## 8: Urban Regular 189
## 9: Youth Youth 136
Pick up top N Selling Product in Each Category
#smry
N <- 2
smry[order(Category,-Total)][,head(.SD, N), keyby = .(Category)]
## Category ProductID Total Product Segment
## 1: Mix 1053 53978 Pirum MA-11 All Season
## 2: Mix 1916 26257 Currus MA-09 All Season
## 3: Rural 791 113849 Natura RP-79 Productivity
## 4: Rural 792 113849 Natura RP-80 Productivity
## 5: Urban 604 120822 Maximus UC-69 Convenience
## 6: Urban 2332 107074 Aliqui UE-06 Extreme
## 7: Youth 2396 52684 Aliqui YY-05 Youth
## 8: Youth 1000 48719 Natura YY-01 Youth
Mean Sales for each Category
smry[ ,lapply(.SD, mean), by = Category, .SDcols = "Total"]
## Category Total
## 1: Mix 2340.580
## 2: Rural 6349.259
## 3: Urban 5170.814
## 4: Youth 3839.368
Number of Products per Category with Product ID > N
N <-1000
smry[, .N, by = .(Category,ProductID > N)][order(Category,ProductID > N)]
## Category ProductID N
## 1: Mix FALSE 48
## 2: Mix TRUE 109
## 3: Rural FALSE 128
## 4: Rural TRUE 497
## 5: Urban FALSE 683
## 6: Urban TRUE 470
## 7: Youth FALSE 3
## 8: Youth TRUE 133
% Contribution of each Category to total Sales
smry[,sum(Total)/(smry[,sum(Total)]), by = Category]
## Category V1
## 1: Mix 0.03396264
## 2: Rural 0.36675952
## 3: Urban 0.55101900
## 4: Youth 0.04825885
% Contribution of each Segment in Category
#smry1 <- smry[ ,sum(Total), keyby = .(Category, Segment)]
#smry1
#smry1[,V1/sum(V1), keyby = .(Category)]
smry[ ,.(T1 = sum(Total)), keyby = .(Category, Segment)][,.(Segment, P1 = T1/sum(T1)), keyby = .(Category)]
## Category Segment P1
## 1: Mix All Season 0.95479915
## 2: Mix Productivity 0.04520085
## 3: Rural Productivity 0.86927886
## 4: Rural Select 0.13072114
## 5: Urban Convenience 0.46002440
## 6: Urban Extreme 0.23975951
## 7: Urban Moderation 0.26709370
## 8: Urban Regular 0.03312239
## 9: Youth Youth 1.00000000
% Contribution of each Segment in Category
qry7 <- "SELECT s.Category, s.Segment, s.T1*100/p.T2 AS P1
FROM (
SELECT Category, Segment, sum(Total) AS T1
FROM smry
GROUP BY Category, Segment) AS s
JOIN (SELECT Category, SUM(Total) AS T2
FROM smry
GROUP BY Category) AS p
ON p.Category = s.Category
GROUP BY s.Category, s.Segment"
sqldf(qry7)
## Category Segment P1
## 1 Mix All Season 95
## 2 Mix Productivity 4
## 3 Rural Productivity 86
## 4 Rural Select 13
## 5 Urban Convenience 46
## 6 Urban Extreme 23
## 7 Urban Moderation 26
## 8 Urban Regular 3
## 9 Youth Youth 100
Selecting Top Nth Product in Each Category
smry[order(Category, -Total)][ , .SD[1], by = Category]
## Category ProductID Total Product Segment
## 1: Mix 1053 53978 Pirum MA-11 All Season
## 2: Rural 791 113849 Natura RP-79 Productivity
## 3: Urban 604 120822 Maximus UC-69 Convenience
## 4: Youth 2396 52684 Aliqui YY-05 Youth
Selecting Top Nth Product in Each Category using Ranking
qry8 <- "SELECT Category, Segment, Product, Total,
(
SELECT COUNT(*)+1
FROM smry as s2
WHERE s2.Category = s1.Category AND s2.Total > s1.Total
) AS Rnk
FROM smry AS s1
WHERE Rnk = 1
GROUP BY Category, Segment, Product
ORDER BY Category, Rnk"
sqldf(qry8)
## Category Segment Product Total Rnk
## 1 Mix All Season Pirum MA-11 53978 1
## 2 Rural Productivity Natura RP-79 113849 1
## 3 Rural Productivity Natura RP-80 113849 1
## 4 Urban Convenience Maximus UC-69 120822 1
## 5 Youth Youth Aliqui YY-05 52684 1
Data Table reference semantics
smry1 <- smry
smry1[, rank := as.numeric(frank(-Total , ties.method = "min")), by = Category][order(Category, rank)][rank == 1]
## ProductID Total Product Category Segment rank
## 1: 1053 53978 Pirum MA-11 Mix All Season 1
## 2: 791 113849 Natura RP-79 Rural Productivity 1
## 3: 792 113849 Natura RP-80 Rural Productivity 1
## 4: 604 120822 Maximus UC-69 Urban Convenience 1
## 5: 2396 52684 Aliqui YY-05 Youth Youth 1
Setting Keys
setkey(smry1, Category, Segment)
smry1[.("Mix", "All Season"), sum(Total), by = .(Category, Segment)]
## Category Segment V1
## 1: Mix All Season 350861
WITH keyword supported in sqldf. % contribution of each Segment in Category
#head(smry)
smry <- smry1[,-"rank"]
qry10 <- "WITH s AS (
SELECT Category, sum(Total) as Tot
FROM smry
GROUP BY Category)
SELECT s1.Category, s1.Segment, sum(s1.Total)*100/s.Tot
FROM smry AS s1
JOIN s
ON s.Category = s1.Category
GROUP BY s1.Category, s1.Segment"
sqldf(qry10)
## Category Segment sum(s1.Total)*100/s.Tot
## 1 Mix All Season 95
## 2 Mix Productivity 4
## 3 Rural Productivity 86
## 4 Rural Select 13
## 5 Urban Convenience 46
## 6 Urban Extreme 23
## 7 Urban Moderation 26
## 8 Urban Regular 3
## 9 Youth Youth 100
Pivoting
dcast(smry, Category ~ Segment, value.var = "Total")
## Aggregate function missing, defaulting to 'length'
## Category All Season Convenience Extreme Moderation Productivity Regular
## 1: Mix 125 0 0 0 32 0
## 2: Rural 0 0 0 0 502 0
## 3: Urban 0 382 317 265 0 189
## 4: Youth 0 0 0 0 0 0
## Select Youth
## 1: 0 0
## 2: 123 0
## 3: 0 0
## 4: 0 136
temp <- dcast(smry, Category ~ Segment, fun.aggregate = function(x) sum(x), value.var = "Total")
temp
## Category All Season Convenience Extreme Moderation Productivity Regular
## 1: Mix 350861 0 0 0 16610 0
## 2: Rural 0 0 0 0 3449548 0
## 3: Urban 0 2742642 1429434 1592399 0 197474
## 4: Youth 0 0 0 0 0 0
## Select Youth
## 1: 0 0
## 2: 518739 0
## 3: 0 0
## 4: 0 522154
PIVOT Keyword not supported in sqldf
qry11 <- "SELECT *
FROM smry
PIVOT
(
SUM(Total)
FOR Segment
IN ([Extreme], [Regular])
) pt"
# sqldf(qry11)
SELECT *
FROM (
SELECT SUM(sf.Units) as Total, p.Category, p.Segment
FROM bi.salesFact as sf
INNER JOIN bi.product as p
ON p.ProductID = sf.ProductID
GROUP BY sf.ProductID, p.Product, p.Category, p.Segment
) AS smry
PIVOT
(
SUM(Total)
FOR Segment
IN ([Productivity])
) pt
Category | Productivity |
---|---|
Mix | 16610 |
Rural | 3449548 |
Urban | NA |
Youth | NA |