Assignment A02

Exploring Phishing Dataset for Machine Learning with Tidyverse.

Description:

The phishing dataset was downloaded from the Kaggle website.

  1. Dataset information

    This dataset contains 48 features extracted from 5000 phishing web pages and 5000 legitimate web pages, which were downloaded from January to May 2015 and from May to June 2017.

  2. The tidyverse collection of packages (which includes dplyr) is used throughout.

Importing data from phishing.csv file

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(dplyr)

# Import the dataset. Every column is numeric, so the column types are pinned
# explicitly instead of being re-guessed (and reported) on each run.
phishing <- read_csv(
  "phishing.csv",
  col_types = cols(.default = col_double())
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Preview the first rows and the column types of the imported tibble.
head(phishing)
## # A tibble: 6 x 50
##      id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
##   <dbl>   <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>
## 1     1       3              1         5        72       0                 0
## 2     2       3              1         3       144       0                 0
## 3     3       3              1         2        58       0                 0
## 4     4       3              1         6        79       1                 0
## 5     5       3              0         4        46       0                 0
## 6     6       3              1         1        42       1                 0
## # ... with 43 more variables: AtSymbol <dbl>, TildeSymbol <dbl>,
## #   NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## #   NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## #   RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## #   DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## #   PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## #   NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## #   PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## #   RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## #   PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## #   FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## #   SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## #   ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## #   PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## #   ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## #   CLASS_LABEL <dbl>
# List every column name in the dataset.
names(phishing)
##  [1] "id"                                 "NumDots"                           
##  [3] "SubdomainLevel"                     "PathLevel"                         
##  [5] "UrlLength"                          "NumDash"                           
##  [7] "NumDashInHostname"                  "AtSymbol"                          
##  [9] "TildeSymbol"                        "NumUnderscore"                     
## [11] "NumPercent"                         "NumQueryComponents"                
## [13] "NumAmpersand"                       "NumHash"                           
## [15] "NumNumericChars"                    "NoHttps"                           
## [17] "RandomString"                       "IpAddress"                         
## [19] "DomainInSubdomains"                 "DomainInPaths"                     
## [21] "HttpsInHostname"                    "HostnameLength"                    
## [23] "PathLength"                         "QueryLength"                       
## [25] "DoubleSlashInPath"                  "NumSensitiveWords"                 
## [27] "EmbeddedBrandName"                  "PctExtHyperlinks"                  
## [29] "PctExtResourceUrls"                 "ExtFavicon"                        
## [31] "InsecureForms"                      "RelativeFormAction"                
## [33] "ExtFormAction"                      "AbnormalFormAction"                
## [35] "PctNullSelfRedirectHyperlinks"      "FrequentDomainNameMismatch"        
## [37] "FakeLinkInStatusBar"                "RightClickDisabled"                
## [39] "PopUpWindow"                        "SubmitInfoToEmail"                 
## [41] "IframeOrFrame"                      "MissingTitle"                      
## [43] "ImagesOnlyInForm"                   "SubdomainLevelRT"                  
## [45] "UrlLengthRT"                        "PctExtResourceUrlsRT"              
## [47] "AbnormalExtFormActionR"             "ExtMetaScriptLinkRT"               
## [49] "PctExtNullSelfRedirectHyperlinksRT" "CLASS_LABEL"

One table verb

  1. Select keeps only the named variables; here NumDots and SubdomainLevel are kept.
# Keep only the two columns of interest.
select(phishing, NumDots, SubdomainLevel)
## # A tibble: 10,000 x 2
##    NumDots SubdomainLevel
##      <dbl>          <dbl>
##  1       3              1
##  2       3              1
##  3       3              1
##  4       3              1
##  5       3              0
##  6       3              1
##  7       2              0
##  8       1              0
##  9       8              7
## 10       2              0
## # ... with 9,990 more rows
  1. Arrange in ascending / descending order:

    HostnameLength is arranged in descending order.

library(dplyr)
library(tidyverse)

# Order the rows by hostname length (longest first), then keep a handful of
# columns for display.
phishing %>%
  arrange(desc(HostnameLength)) %>%
  select(
    id, EmbeddedBrandName, UrlLength,
    SubdomainLevel, PathLevel, HostnameLength
  )
## # A tibble: 10,000 x 6
##       id EmbeddedBrandName UrlLength SubdomainLevel PathLevel HostnameLength
##    <dbl>             <dbl>     <dbl>          <dbl>     <dbl>          <dbl>
##  1  3042                 1       178              5         2            137
##  2  2926                 0       159             14         8            115
##  3  3693                 0       141              4         2            109
##  4  2529                 0       157              8         3             94
##  5  2544                 0       156              8         3             94
##  6  2555                 0       176              8         3             94
##  7  3645                 0       138             10         4             86
##  8   101                 1       149             11         6             84
##  9   569                 1       165             11         6             84
## 10  2083                 0       163              7         3             83
## # ... with 9,990 more rows
  1. Mutate to add a new variable:

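    mutate() adds new variables that are computed from existing ones (it should not be confused with mutating joins, which combine variables from multiple tables). Here HostnameLength_PathLength is added as a new variable by mutate().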

# Narrow to the two length columns, derive their sum as a new column, and
# show the largest combined lengths first.
phishing %>%
  select(HostnameLength, PathLength) %>%
  mutate(HostnameLength_PathLength = HostnameLength + PathLength) %>%
  arrange(desc(HostnameLength_PathLength))
## # A tibble: 10,000 x 3
##    HostnameLength PathLength HostnameLength_PathLength
##             <dbl>      <dbl>                     <dbl>
##  1             73        120                       193
##  2             29        156                       185
##  3             25        156                       181
##  4             16        159                       175
##  5             16        159                       175
##  6             20        154                       174
##  7            137         34                       171
##  8             30        141                       171
##  9             10        161                       171
## 10             31        138                       169
## # ... with 9,990 more rows

Two table verbs

It is rare for a data analysis to involve only a single table. We usually have a number of tables and need flexible tools to bring them together.

  1. left_join(): returns all rows from the left table (table_1), together with the matching rows from the right table (table_2).
library(dplyr)
library(tidyverse)

# Carve two narrower tables out of the dataset; both carry the `id` column.
table_1 <- select(phishing, id, AbnormalExtFormActionR, PctExtResourceUrlsRT)
table_2 <- select(phishing, id, MissingTitle, ImagesOnlyInForm)

# Show the left-hand table.
table_1
## # A tibble: 10,000 x 3
##       id AbnormalExtFormActionR PctExtResourceUrlsRT
##    <dbl>                  <dbl>                <dbl>
##  1     1                      1                    1
##  2     2                      1                    1
##  3     3                      1                   -1
##  4     4                      1                    1
##  5     5                      0                   -1
##  6     6                      1                    1
##  7     7                      1                    1
##  8     8                      1                    1
##  9     9                      1                    1
## 10    10                      1                    1
## # ... with 9,990 more rows
# Show the right-hand table.
table_2
## # A tibble: 10,000 x 3
##       id MissingTitle ImagesOnlyInForm
##    <dbl>        <dbl>            <dbl>
##  1     1            0                1
##  2     2            0                0
##  3     3            0                0
##  4     4            0                0
##  5     5            0                0
##  6     6            1                0
##  7     7            0                0
##  8     8            0                0
##  9     9            0                0
## 10    10            0                0
## # ... with 9,990 more rows
# Name the join key explicitly so the join cannot silently start matching on
# additional shared columns if either table changes.
table_1 %>%
  left_join(table_2, by = "id")
## Joining, by = "id"
## # A tibble: 10,000 x 5
##       id AbnormalExtFormActio~ PctExtResourceUrls~ MissingTitle ImagesOnlyInForm
##    <dbl>                 <dbl>               <dbl>        <dbl>            <dbl>
##  1     1                     1                   1            0                1
##  2     2                     1                   1            0                0
##  3     3                     1                  -1            0                0
##  4     4                     1                   1            0                0
##  5     5                     0                  -1            0                0
##  6     6                     1                   1            1                0
##  7     7                     1                   1            0                0
##  8     8                     1                   1            0                0
##  9     9                     1                   1            0                0
## 10    10                     1                   1            0                0
## # ... with 9,990 more rows
  1. full_join(): Keeps all of the entries in both tables, regardless of whether or not they appear in the other table.
library(dplyr)
library(tidyverse)

# Carve two more tables out of the dataset; both carry the `id` column.
table_3 <- select(phishing, id, InsecureForms, RelativeFormAction, ExtFormAction)
table_4 <- select(phishing, id, PctExtHyperlinks, PctExtResourceUrls)

# Show the first of the two tables.
table_3
## # A tibble: 10,000 x 4
##       id InsecureForms RelativeFormAction ExtFormAction
##    <dbl>         <dbl>              <dbl>         <dbl>
##  1     1             1                  0             0
##  2     2             1                  0             0
##  3     3             1                  0             0
##  4     4             1                  0             0
##  5     5             0                  0             1
##  6     6             1                  0             0
##  7     7             1                  0             0
##  8     8             1                  0             0
##  9     9             1                  0             0
## 10    10             1                  0             0
## # ... with 9,990 more rows
# Show the second of the two tables.
table_4
## # A tibble: 10,000 x 3
##       id PctExtHyperlinks PctExtResourceUrls
##    <dbl>            <dbl>              <dbl>
##  1     1            0                 0.25  
##  2     2            0                 0     
##  3     3            0.375             1     
##  4     4            1                 0.0952
##  5     5            1                 1     
##  6     6            0.1               1     
##  7     7            0.909             1     
##  8     8            0                 0     
##  9     9            0                 0     
## 10    10            1                 1     
## # ... with 9,990 more rows
# Name the join key explicitly so the join cannot silently start matching on
# additional shared columns if either table changes.
table_3 %>%
  full_join(table_4, by = "id")
## Joining, by = "id"
## # A tibble: 10,000 x 6
##       id InsecureForms RelativeFormAction ExtFormAction PctExtHyperlinks
##    <dbl>         <dbl>              <dbl>         <dbl>            <dbl>
##  1     1             1                  0             0            0    
##  2     2             1                  0             0            0    
##  3     3             1                  0             0            0.375
##  4     4             1                  0             0            1    
##  5     5             0                  0             1            1    
##  6     6             1                  0             0            0.1  
##  7     7             1                  0             0            0.909
##  8     8             1                  0             0            0    
##  9     9             1                  0             0            0    
## 10    10             1                  0             0            1    
## # ... with 9,990 more rows, and 1 more variable: PctExtResourceUrls <dbl>

Grouping

Most data operations are done on groups defined by variables. group_by() takes an existing table and converts it into a grouped table where operations are performed “by group”. ungroup() removes grouping.

1)Summarize with groups: It allows us to get a summary row for each group.

library(dplyr)
library(tidyverse)

  
  # Summarise a numeric column within each AbnormalExtFormActionR group.
  # The earlier version passed the whole `phishing` tibble to mean() and max(),
  # which returns NA with a warning and the maximum over all columns; the
  # warnings and table printed beneath this chunk come from that version, so
  # re-knit to refresh them.
  phishing %>%
    group_by(AbnormalExtFormActionR) %>%
    summarize(
      mean_url_length = mean(UrlLength),
      max_url_length = max(UrlLength),
      count = n()
    )
## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA

## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA

## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA
## # A tibble: 3 x 4
##   AbnormalExtFormActionR mean_phishing max_phishing count
##                    <dbl>         <dbl>        <dbl> <int>
## 1                     -1            NA        10000   537
## 2                      0            NA        10000   994
## 3                      1            NA        10000  8469
  1. summarise() removes a layer of grouping
# Group on two variables at once.
by_NumDots_SubdomainLevel <- phishing %>%
  group_by(NumDots, SubdomainLevel)

# Count the rows in each (NumDots, SubdomainLevel) group. summarise() peels off
# the last grouping level; `.groups = "drop_last"` states that default
# explicitly, so the result stays grouped by NumDots.
by_NumDots <- by_NumDots_SubdomainLevel %>%
  summarise(n = n(), .groups = "drop_last")
## `summarise()` has grouped output by 'NumDots'. You can override using the `.groups` argument.
# Show the per-group counts.
by_NumDots
## # A tibble: 54 x 3
## # Groups:   NumDots [17]
##    NumDots SubdomainLevel     n
##      <dbl>          <dbl> <int>
##  1       1              0  1959
##  2       2              0  1998
##  3       2              1  2055
##  4       3              0   529
##  5       3              1  2049
##  6       3              2   145
##  7       4              0   222
##  8       4              1   484
##  9       4              2    83
## 10       4              3     6
## # ... with 44 more rows
# Derive the grouping variable as a new column, then group on it.
phishing %>%
  mutate(NumDots_SubdomainLevel = NumDots + SubdomainLevel) %>%
  group_by(NumDots_SubdomainLevel)
## # A tibble: 10,000 x 51
## # Groups:   NumDots_SubdomainLevel [23]
##       id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
##    <dbl>   <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>
##  1     1       3              1         5        72       0                 0
##  2     2       3              1         3       144       0                 0
##  3     3       3              1         2        58       0                 0
##  4     4       3              1         6        79       1                 0
##  5     5       3              0         4        46       0                 0
##  6     6       3              1         1        42       1                 0
##  7     7       2              0         5        60       0                 0
##  8     8       1              0         3        30       0                 0
##  9     9       8              7         2        76       1                 1
## 10    10       2              0         2        46       0                 0
## # ... with 9,990 more rows, and 44 more variables: AtSymbol <dbl>,
## #   TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## #   NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## #   NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## #   DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## #   HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## #   DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## #   PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## #   InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## #   AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## #   FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## #   RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## #   IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## #   SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## #   AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## #   PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>,
## #   NumDots_SubdomainLevel <dbl>

Vector function

1) Logical operators are applied to select the rows where PathLevel is strictly between 1 and 5 (greater than 1 and less than 5) and, separately, the rows where NumDash is equal to 1 and DomainInPaths is not 0.

# Work on a copy of the dataset.
v <- phishing

# Keep the rows whose PathLevel is greater than 1 and less than 5; the two
# comparisons are combined element-wise with `&`.
v[with(v, PathLevel > 1 & PathLevel < 5), ]
## # A tibble: 6,548 x 50
##       id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
##    <dbl>   <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>
##  1     2       3              1         3       144       0                 0
##  2     3       3              1         2        58       0                 0
##  3     5       3              0         4        46       0                 0
##  4     8       1              0         3        30       0                 0
##  5     9       8              7         2        76       1                 1
##  6    10       2              0         2        46       0                 0
##  7    11       5              4         2        64       1                 1
##  8    12       2              0         2        47       0                 0
##  9    13       2              1         2        61       1                 1
## 10    14       2              1         3        35       0                 0
## # ... with 6,538 more rows, and 43 more variables: AtSymbol <dbl>,
## #   TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## #   NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## #   NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## #   DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## #   HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## #   DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## #   PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## #   InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## #   AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## #   FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## #   RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## #   IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## #   SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## #   AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## #   PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>
 # Keep the rows where NumDash equals 1 and DomainInPaths is non-zero.
 v[with(v, NumDash == 1 & DomainInPaths != 0), ]
## # A tibble: 780 x 50
##       id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
##    <dbl>   <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>
##  1     4       3              1         6        79       1                 0
##  2     6       3              1         1        42       1                 0
##  3     9       8              7         2        76       1                 1
##  4    11       5              4         2        64       1                 1
##  5    18       3              1         2        59       1                 1
##  6    42       3              1         3        53       1                 0
##  7    54       4              1         3        51       1                 1
##  8    66       5              1         5       120       1                 0
##  9    72       5              1         5       120       1                 0
## 10    82       5              0         7       135       1                 0
## # ... with 770 more rows, and 43 more variables: AtSymbol <dbl>,
## #   TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## #   NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## #   NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## #   DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## #   HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## #   DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## #   PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## #   InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## #   AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## #   FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## #   RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## #   IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## #   SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## #   AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## #   PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>
  1. Vectors can be constructed with c(), :, or seq().
# The colon operator builds a sequence of consecutive integers.
5:10
## [1]  5  6  7  8  9 10
# Concatenate the even numbers up to 20 with the odd numbers below 20.
c(
  seq(from = 2, to = 20, by = 2),
  seq(from = 1, to = 19, by = 2)
)
##  [1]  2  4  6  8 10 12 14 16 18 20  1  3  5  7  9 11 13 15 17 19
# Pick rows 1, 3, 5, 7 and 9 by position using a generated index vector.
phishing[seq(from = 1, to = 9, by = 2), ]
## # A tibble: 5 x 50
##      id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
##   <dbl>   <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>
## 1     1       3              1         5        72       0                 0
## 2     3       3              1         2        58       0                 0
## 3     5       3              0         4        46       0                 0
## 4     7       2              0         5        60       0                 0
## 5     9       8              7         2        76       1                 1
## # ... with 43 more variables: AtSymbol <dbl>, TildeSymbol <dbl>,
## #   NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## #   NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## #   RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## #   DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## #   PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## #   NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## #   PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## #   RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## #   PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## #   FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## #   SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## #   ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## #   PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## #   ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## #   CLASS_LABEL <dbl>

Pivoting

This section describes the use of the pivot_longer() and pivot_wider() functions, which were introduced to improve on the usability of gather() and spread().

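  1. pivot_longer: lengthens data, increasing the number of rows and decreasing the number of columns. In the example below only a single column (NumDots) is pivoted, so the number of rows stays the same and that column is replaced by a name column and a value column.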
library(tidyverse)

# Move the NumDots column into a name column and a value column.
phishing %>%
  pivot_longer(
    cols = NumDots,
    names_to = "NumDots old",
    values_to = "NumDots new"
  )
## # A tibble: 10,000 x 51
##       id SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname AtSymbol
##    <dbl>          <dbl>     <dbl>     <dbl>   <dbl>             <dbl>    <dbl>
##  1     1              1         5        72       0                 0        0
##  2     2              1         3       144       0                 0        0
##  3     3              1         2        58       0                 0        0
##  4     4              1         6        79       1                 0        0
##  5     5              0         4        46       0                 0        0
##  6     6              1         1        42       1                 0        0
##  7     7              0         5        60       0                 0        0
##  8     8              0         3        30       0                 0        0
##  9     9              7         2        76       1                 1        0
## 10    10              0         2        46       0                 0        0
## # ... with 9,990 more rows, and 44 more variables: TildeSymbol <dbl>,
## #   NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## #   NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## #   RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## #   DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## #   PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## #   NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## #   PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## #   RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## #   PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## #   FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## #   SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## #   ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## #   PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## #   ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## #   CLASS_LABEL <dbl>, NumDots old <chr>, NumDots new <dbl>
  1. pivot_wider: Widens data, increasing the number of columns and decreasing the number of rows. The inverse transformation is pivot_longer().
# Attach to every row the number of rows sharing its SubdomainLevelRT value,
# then spread those counts into one column per SubdomainLevelRT value.
# Duplicate cells are collapsed with mean().
phishing %>%
  select(SubdomainLevelRT, UrlLengthRT) %>%
  group_by(SubdomainLevelRT) %>%
  mutate(SubdomainLevelRT_n = n()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = SubdomainLevelRT,
    values_from = SubdomainLevelRT_n,
    names_prefix = "SubdomainLevelRT_",
    values_fn = list(SubdomainLevelRT_n = mean)
  )
## # A tibble: 3 x 4
##   UrlLengthRT SubdomainLevelRT_1 `SubdomainLevelRT_-1` SubdomainLevelRT_0
##         <dbl>              <dbl>                 <dbl>              <dbl>
## 1           0               9666                   100                234
## 2          -1               9666                   100                234
## 3           1               9666                   100                234

Missing value

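The phishing dataset itself is complete, so a few missing values are introduced artificially below in order to demonstrate how they can be handled.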

Following are the 3 tidyr functions that are handy for processing Missing Values

  1. drop_na()-drops/removes the rows/entries with Missing Values
  2. fill()- fills the NAs (missing values) in selected columns
  3. replace_na()-used when you have got the replacement value which the NAs should be filled with.
library(dplyr)

# Work on a copy so the original `phishing` tibble stays untouched.
df <- phishing

# Blank out four individual cells to create missing values.
df[["RightClickDisabled"]][2] <- NA
df[["NumDash"]][5] <- NA
df[["NumDashInHostname"]][5] <- NA
df[["SubdomainLevel"]][10] <- NA

# Report how many cells are now missing.
paste("Number of Missing Values", sum(is.na(df)))
## [1] "Number of Missing Values 4"
# Remove every row that contains at least one NA, then confirm none remain.
df_no_na <- df %>%
  drop_na()
paste("Number of Missing Values", sum(is.na(df_no_na)))
## [1] "Number of Missing Values 0"
# Row count after dropping the incomplete rows.
paste("Number of Rows",nrow(df_no_na))
## [1] "Number of Rows 9997"
# Column count after dropping the incomplete rows.
paste("Number of Columns",ncol(df_no_na))
## [1] "Number of Columns 50"
# Fill the missing cells in every column using fill()'s default direction.
df_na_filled <- fill(df, dplyr::everything())

# Report how many cells are still missing after filling.
paste("Number of Missing Values", sum(is.na(df_na_filled)))
## [1] "Number of Missing Values 0"
# Row count after filling.
paste("Number of Rows",nrow(df_na_filled))
## [1] "Number of Rows 10000"
# Column count after filling.
paste("Number of Columns",ncol(df_na_filled))
## [1] "Number of Columns 50"
# Replace every NA in the numeric columns with 0. across() is the current
# replacement for the superseded mutate_if().
df_na_replaced <- df %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))