Showing posts with label Statistics. Show all posts
Showing posts with label Statistics. Show all posts

Friday, May 27, 2016

Correlation in DAX

In this blogpost I show how to calculate the correlation in DAX. This post will be refined in the future, also to show the comparison with R. The code is shown so you see how to run it in DAX studio. We will investigate the correlation between visits and sales. We define the standard deviation for visits:
    
DEFINE
    MEASURE Visits[sdtotal_visits] =
        CALCULATE (
            STDEVX.P ( VALUES ( 'Visit Date'[Month] ), [Completed Visits] ),
            ALLSELECTED ( 'Visit Date' )
        )

We define the mean for visits:
    
    MEASURE Visits[meantotal_visits] =
        CALCULATE (
            AVERAGEX ( VALUES ( 'Visit Date'[Month] ), [Completed Visits] ),
            ALLSELECTED ( 'Visit Date' )
        )
Same for value
    
    MEASURE Visits[sdtotal_ov] =
        CALCULATE (
            STDEVX.P ( VALUES ( 'Visit Date'[Month] ), [Order Value] ),
            ALLSELECTED ( 'Visit Date' )
        )
    MEASURE Visits[meantotal_ov] =
        CALCULATE (
            AVERAGEX ( VALUES ( 'Visit Date'[Month] ), [Order Value] ),
            ALLSELECTED ( 'Visit Date' )
        )
We multiply the two standard deviations:
    
MEASURE Visits[sdsd] =
        CALCULATE ( [sdtotal_visits] * [sdtotal_ov] )
We calculate the deviation from the mean for each measure:
    
    MEASURE Visits[afwijking_visits] =
        CALCULATE ( ( [Completed Visits] - [meantotal_visits] ) )
    MEASURE Visits[afwijking_value] =
        CALCULATE ( ( [Order Value] - [meantotal_ov] ) )
We calculate the average of the product of the two deviations:
    
MEASURE Visits[avgproduct] =
        CALCULATE (
            AVERAGEX (
                VALUES ( 'Visit Date'[Month] ),
                [afwijking_visits] * [afwijking_value]
            ),
            ALLSELECTED ( 'Visit Date' )
        )
We devide this value of the product of the two SD's
    
    MEASURE Visits[correlation2] =
        CALCULATE ( [avgproduct] / [sdsd] )
Now the query to view the results:
    
EVALUATE
CALCULATETABLE (
    ADDCOLUMNS (
        SUMMARIZE ( 'Visit Date', 'Visit Date'[Month] ),
        "Visits", [Completed Visits],
        "order value", [Order Value],
        "correlation2", [correlation2]
    ),
    'Visit Date'[Month Key] = "2015M12"
        || 'Visit Date'[Month Key] = "2016M01"
        || 'Visit Date'[Month Key] = "2016M02"
)

Sunday, November 1, 2015

Inference on the slope

Inference on the slope

In this post I will explore the concept of inference on the slope. When you fit a model you get a confidence interval for the slope. This is explained in this video.

In this post I will simulate this to get a better understanding of the theory.

set.seed(300)
n=100
x <- c(1:n) 
beta0 <- 3 # intercept
beta1 <- 1.8 # slope
y <- beta0 +  beta1 * x + rnorm(n,mean = 0,sd = 20) # sd of error is 6
fit <- lm(y ~ x)
plot(x,y)
abline(fit)

f = summary(fit)
e = resid(fit)

#slope
f$coef[2,1]
## [1] 1.753188
#standard error of the slope
f$coef[2,2]
## [1] 0.06440277
#standard error of the slope, calculated manually
(sqrt(sum(e^2)/(n-2))) / sqrt(((sum(x^2)) - sum(x)^2/100))
## [1] 0.06440277

What if we fit a model 1000 times

library(ggplot2)

ss = data.frame(1,1)
reeks <- c(1:1000)
for (i in reeks) {
  y <- beta0 +  beta1 * x + rnorm(n,mean = 0,sd = 20) # sd of error is 6
  fit <- lm(y ~ x)
  ss[i,1] = summary(fit)$coef[2,1]
  ss[i,2] = summary(fit)$coef[1,1]
  }

colnames(ss) = c('slopes', 'intercept')
ds = data.frame(cbind(x,y)); colnames(ds) = c('x','y')

g = ggplot(ds, aes(x,y)) + geom_point()
g + geom_abline(data=ss[1:150,], aes(slope=slopes, intercept=intercept, alpha="0.005"))

ggplot(ss, aes(slopes)) +  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean(ss[,1])
## [1] 1.800199
sd(ss[,1])
## [1] 0.07030048