Assignment A02
Exploring Phishing Dataset for Machine Learning with Tidyverse.
Description:
The Phishing dataset is downloaded from kaggle website.
Dataset information
This dataset contains 48 features extracted from 5,000 phishing web pages and 5,000 legitimate web pages, which were downloaded from January to May 2015 and from May to June 2017.
The tidyverse packages (including dplyr and tidyr) are used throughout.
Importing data from phishing.csv file
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
# Load the phishing feature table from the CSV file in the working directory.
phishing <- readr::read_csv(file = "phishing.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Preview the first six rows of the table.
head(phishing, n = 6L)
## # A tibble: 6 x 50
## id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3 1 5 72 0 0
## 2 2 3 1 3 144 0 0
## 3 3 3 1 2 58 0 0
## 4 4 3 1 6 79 1 0
## 5 5 3 0 4 46 0 0
## 6 6 3 1 1 42 1 0
## # ... with 43 more variables: AtSymbol <dbl>, TildeSymbol <dbl>,
## # NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## # NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## # RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## # DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## # PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## # NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## # PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## # RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## # PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## # FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## # SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## # ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## # PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## # ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## # CLASS_LABEL <dbl>
# List every column name in the table.
colnames(phishing)
## [1] "id" "NumDots"
## [3] "SubdomainLevel" "PathLevel"
## [5] "UrlLength" "NumDash"
## [7] "NumDashInHostname" "AtSymbol"
## [9] "TildeSymbol" "NumUnderscore"
## [11] "NumPercent" "NumQueryComponents"
## [13] "NumAmpersand" "NumHash"
## [15] "NumNumericChars" "NoHttps"
## [17] "RandomString" "IpAddress"
## [19] "DomainInSubdomains" "DomainInPaths"
## [21] "HttpsInHostname" "HostnameLength"
## [23] "PathLength" "QueryLength"
## [25] "DoubleSlashInPath" "NumSensitiveWords"
## [27] "EmbeddedBrandName" "PctExtHyperlinks"
## [29] "PctExtResourceUrls" "ExtFavicon"
## [31] "InsecureForms" "RelativeFormAction"
## [33] "ExtFormAction" "AbnormalFormAction"
## [35] "PctNullSelfRedirectHyperlinks" "FrequentDomainNameMismatch"
## [37] "FakeLinkInStatusBar" "RightClickDisabled"
## [39] "PopUpWindow" "SubmitInfoToEmail"
## [41] "IframeOrFrame" "MissingTitle"
## [43] "ImagesOnlyInForm" "SubdomainLevelRT"
## [45] "UrlLengthRT" "PctExtResourceUrlsRT"
## [47] "AbnormalExtFormActionR" "ExtMetaScriptLinkRT"
## [49] "PctExtNullSelfRedirectHyperlinksRT" "CLASS_LABEL"
One table verb
- select() keeps only the named variables; here NumDots and SubdomainLevel are kept.
# Keep only the NumDots and SubdomainLevel columns.
select(phishing, NumDots, SubdomainLevel)
## # A tibble: 10,000 x 2
## NumDots SubdomainLevel
## <dbl> <dbl>
## 1 3 1
## 2 3 1
## 3 3 1
## 4 3 1
## 5 3 0
## 6 3 1
## 7 2 0
## 8 1 0
## 9 8 7
## 10 2 0
## # ... with 9,990 more rows
Arrange in ascending / descending order:
HostnameLength is arranged in descending order.
library(dplyr)
library(tidyverse)
# Sort by HostnameLength, longest first, then keep the URL-structure columns.
# Sorting before selecting gives the same rows in the same order.
phishing %>%
  arrange(desc(HostnameLength)) %>%
  select(
    id, EmbeddedBrandName, UrlLength,
    SubdomainLevel, PathLevel, HostnameLength
  )
## # A tibble: 10,000 x 6
## id EmbeddedBrandName UrlLength SubdomainLevel PathLevel HostnameLength
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3042 1 178 5 2 137
## 2 2926 0 159 14 8 115
## 3 3693 0 141 4 2 109
## 4 2529 0 157 8 3 94
## 5 2544 0 156 8 3 94
## 6 2555 0 176 8 3 94
## 7 3645 0 138 10 4 86
## 8 101 1 149 11 6 84
## 9 569 1 165 11 6 84
## 10 2083 0 163 7 3 83
## # ... with 9,990 more rows
Mutate to add a new variable:
mutate() adds new variables computed from existing ones (it should not be confused with mutating joins, which combine variables from multiple tables). Here HostnameLength_PathLength is added as a new variable by mutate().
# Keep the two length columns, add their sum as a new column, and list the
# rows with the largest combined length first.
phishing %>%
  select(HostnameLength, PathLength) %>%
  mutate(HostnameLength_PathLength = HostnameLength + PathLength) %>%
  arrange(desc(HostnameLength_PathLength))
## # A tibble: 10,000 x 3
## HostnameLength PathLength HostnameLength_PathLength
## <dbl> <dbl> <dbl>
## 1 73 120 193
## 2 29 156 185
## 3 25 156 181
## 4 16 159 175
## 5 16 159 175
## 6 20 154 174
## 7 137 34 171
## 8 30 141 171
## 9 10 161 171
## 10 31 138 169
## # ... with 9,990 more rows
Two table verbs
A single data table is rarely enough for a data analysis. We usually have several tables and need flexible tools to bring them together.
- Left_join: The left_join keyword returns all records from the left table (table_1), and the matching records from the right table (table_2).
library(dplyr)
library(tidyverse)
# Build two narrow tables that share the id column so they can be joined.
table_1 <- select(phishing, id, AbnormalExtFormActionR, PctExtResourceUrlsRT)
table_2 <- select(phishing, id, MissingTitle, ImagesOnlyInForm)
table_1
## # A tibble: 10,000 x 3
## id AbnormalExtFormActionR PctExtResourceUrlsRT
## <dbl> <dbl> <dbl>
## 1 1 1 1
## 2 2 1 1
## 3 3 1 -1
## 4 4 1 1
## 5 5 0 -1
## 6 6 1 1
## 7 7 1 1
## 8 8 1 1
## 9 9 1 1
## 10 10 1 1
## # ... with 9,990 more rows
# Display table_2 (id, MissingTitle, ImagesOnlyInForm).
table_2
## # A tibble: 10,000 x 3
## id MissingTitle ImagesOnlyInForm
## <dbl> <dbl> <dbl>
## 1 1 0 1
## 2 2 0 0
## 3 3 0 0
## 4 4 0 0
## 5 5 0 0
## 6 6 1 0
## 7 7 0 0
## 8 8 0 0
## 9 9 0 0
## 10 10 0 0
## # ... with 9,990 more rows
# Keep every row of table_1 and attach the matching table_2 columns.
# No `by` is given, so dplyr joins on the shared column and reports it.
left_join(table_1, table_2)
## Joining, by = "id"
## # A tibble: 10,000 x 5
## id AbnormalExtFormActio~ PctExtResourceUrls~ MissingTitle ImagesOnlyInForm
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 0 1
## 2 2 1 1 0 0
## 3 3 1 -1 0 0
## 4 4 1 1 0 0
## 5 5 0 -1 0 0
## 6 6 1 1 1 0
## 7 7 1 1 0 0
## 8 8 1 1 0 0
## 9 9 1 1 0 0
## 10 10 1 1 0 0
## # ... with 9,990 more rows
- full_join(): Keeps all of the entries in both tables, regardless of whether or not they appear in the other table.
library(dplyr)
library(tidyverse)
# Build two more narrow tables that share the id column for the full join.
table_3 <- select(phishing, id, InsecureForms, RelativeFormAction, ExtFormAction)
table_4 <- select(phishing, id, PctExtHyperlinks, PctExtResourceUrls)
table_3
## # A tibble: 10,000 x 4
## id InsecureForms RelativeFormAction ExtFormAction
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 0
## 2 2 1 0 0
## 3 3 1 0 0
## 4 4 1 0 0
## 5 5 0 0 1
## 6 6 1 0 0
## 7 7 1 0 0
## 8 8 1 0 0
## 9 9 1 0 0
## 10 10 1 0 0
## # ... with 9,990 more rows
# Display table_4 (id, PctExtHyperlinks, PctExtResourceUrls).
table_4
## # A tibble: 10,000 x 3
## id PctExtHyperlinks PctExtResourceUrls
## <dbl> <dbl> <dbl>
## 1 1 0 0.25
## 2 2 0 0
## 3 3 0.375 1
## 4 4 1 0.0952
## 5 5 1 1
## 6 6 0.1 1
## 7 7 0.909 1
## 8 8 0 0
## 9 9 0 0
## 10 10 1 1
## # ... with 9,990 more rows
# Keep all rows from both tables, matched on their shared column.
# No `by` is given, so dplyr joins on the shared column and reports it.
full_join(table_3, table_4)
## Joining, by = "id"
## # A tibble: 10,000 x 6
## id InsecureForms RelativeFormAction ExtFormAction PctExtHyperlinks
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 0 0
## 2 2 1 0 0 0
## 3 3 1 0 0 0.375
## 4 4 1 0 0 1
## 5 5 0 0 1 1
## 6 6 1 0 0 0.1
## 7 7 1 0 0 0.909
## 8 8 1 0 0 0
## 9 9 1 0 0 0
## 10 10 1 0 0 1
## # ... with 9,990 more rows, and 1 more variable: PctExtResourceUrls <dbl>
Grouping
Most data operations are done on groups defined by variables. group_by() takes an existing table and converts it into a grouped table where operations are performed “by group”. ungroup() removes grouping.
1)Summarize with groups: It allows us to get a summary row for each group.
library(dplyr)
library(tidyverse)
# Summarise one column within each AbnormalExtFormActionR group.
# summarize() has to aggregate a column: passing the whole `phishing` tibble
# to mean() returned NA with a warning, and max() returned the largest value
# anywhere in the table instead of a per-group statistic.
# NOTE(review): CLASS_LABEL is assumed to be the intended column (1 is
# presumably "phishing") -- confirm, then re-knit to refresh the printed result.
phishing %>%
  group_by(AbnormalExtFormActionR) %>%
  summarize(
    mean_phishing = mean(CLASS_LABEL),
    max_phishing = max(CLASS_LABEL),
    count = n()
  )
## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA
## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA
## Warning in mean.default(phishing): argument is not numeric or logical: returning
## NA
## # A tibble: 3 x 4
## AbnormalExtFormActionR mean_phishing max_phishing count
## <dbl> <dbl> <dbl> <int>
## 1 -1 NA 10000 537
## 2 0 NA 10000 994
## 3 1 NA 10000 8469
- summarise() removes a layer of grouping
# Group by two variables, then count rows per group; summarise() drops the
# innermost grouping variable from the result's grouping.
by_NumDots_SubdomainLevel <- group_by(phishing, NumDots, SubdomainLevel)
by_NumDots <- summarise(by_NumDots_SubdomainLevel, n = n())
## `summarise()` has grouped output by 'NumDots'. You can override using the `.groups` argument.
# Display the per-group row counts computed by summarise().
by_NumDots
## # A tibble: 54 x 3
## # Groups: NumDots [17]
## NumDots SubdomainLevel n
## <dbl> <dbl> <int>
## 1 1 0 1959
## 2 2 0 1998
## 3 2 1 2055
## 4 3 0 529
## 5 3 1 2049
## 6 3 2 145
## 7 4 0 222
## 8 4 1 484
## 9 4 2 83
## 10 4 3 6
## # ... with 44 more rows
# Add the sum of NumDots and SubdomainLevel as a column, then group by it.
phishing %>%
  mutate(NumDots_SubdomainLevel = NumDots + SubdomainLevel) %>%
  group_by(NumDots_SubdomainLevel)
## # A tibble: 10,000 x 51
## # Groups: NumDots_SubdomainLevel [23]
## id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3 1 5 72 0 0
## 2 2 3 1 3 144 0 0
## 3 3 3 1 2 58 0 0
## 4 4 3 1 6 79 1 0
## 5 5 3 0 4 46 0 0
## 6 6 3 1 1 42 1 0
## 7 7 2 0 5 60 0 0
## 8 8 1 0 3 30 0 0
## 9 9 8 7 2 76 1 1
## 10 10 2 0 2 46 0 0
## # ... with 9,990 more rows, and 44 more variables: AtSymbol <dbl>,
## # TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## # NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## # NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## # DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## # HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## # DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## # PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## # InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## # AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## # FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## # RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## # IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## # SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## # AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## # PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>,
## # NumDots_SubdomainLevel <dbl>
Vector function
1) Logical operators are applied to select rows: first the rows where PathLevel is greater than 1 and less than 5, then the rows where NumDash is equal to 1 and DomainInPaths is not 0.
# Copy the table, then keep rows whose PathLevel is strictly between 1 and 5.
v <- phishing
with(v, v[PathLevel > 1 & PathLevel < 5, ])
## # A tibble: 6,548 x 50
## id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 3 1 3 144 0 0
## 2 3 3 1 2 58 0 0
## 3 5 3 0 4 46 0 0
## 4 8 1 0 3 30 0 0
## 5 9 8 7 2 76 1 1
## 6 10 2 0 2 46 0 0
## 7 11 5 4 2 64 1 1
## 8 12 2 0 2 47 0 0
## 9 13 2 1 2 61 1 1
## 10 14 2 1 3 35 0 0
## # ... with 6,538 more rows, and 43 more variables: AtSymbol <dbl>,
## # TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## # NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## # NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## # DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## # HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## # DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## # PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## # InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## # AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## # FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## # RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## # IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## # SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## # AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## # PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>
# Keep rows with exactly one dash and a non-zero DomainInPaths flag.
with(v, v[NumDash == 1 & DomainInPaths != 0, ])
## # A tibble: 780 x 50
## id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 3 1 6 79 1 0
## 2 6 3 1 1 42 1 0
## 3 9 8 7 2 76 1 1
## 4 11 5 4 2 64 1 1
## 5 18 3 1 2 59 1 1
## 6 42 3 1 3 53 1 0
## 7 54 4 1 3 51 1 1
## 8 66 5 1 5 120 1 0
## 9 72 5 1 5 120 1 0
## 10 82 5 0 7 135 1 0
## # ... with 770 more rows, and 43 more variables: AtSymbol <dbl>,
## # TildeSymbol <dbl>, NumUnderscore <dbl>, NumPercent <dbl>,
## # NumQueryComponents <dbl>, NumAmpersand <dbl>, NumHash <dbl>,
## # NumNumericChars <dbl>, NoHttps <dbl>, RandomString <dbl>, IpAddress <dbl>,
## # DomainInSubdomains <dbl>, DomainInPaths <dbl>, HttpsInHostname <dbl>,
## # HostnameLength <dbl>, PathLength <dbl>, QueryLength <dbl>,
## # DoubleSlashInPath <dbl>, NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>,
## # PctExtHyperlinks <dbl>, PctExtResourceUrls <dbl>, ExtFavicon <dbl>,
## # InsecureForms <dbl>, RelativeFormAction <dbl>, ExtFormAction <dbl>,
## # AbnormalFormAction <dbl>, PctNullSelfRedirectHyperlinks <dbl>,
## # FrequentDomainNameMismatch <dbl>, FakeLinkInStatusBar <dbl>,
## # RightClickDisabled <dbl>, PopUpWindow <dbl>, SubmitInfoToEmail <dbl>,
## # IframeOrFrame <dbl>, MissingTitle <dbl>, ImagesOnlyInForm <dbl>,
## # SubdomainLevelRT <dbl>, UrlLengthRT <dbl>, PctExtResourceUrlsRT <dbl>,
## # AbnormalExtFormActionR <dbl>, ExtMetaScriptLinkRT <dbl>,
## # PctExtNullSelfRedirectHyperlinksRT <dbl>, CLASS_LABEL <dbl>
- Vectors can be constructed with c(), :, or seq().
# Integer sequence from 5 to 10.
seq(from = 5, to = 10)
## [1] 5 6 7 8 9 10
# Even numbers up to 20 followed by odd numbers up to 19.
c(
  seq(from = 2, to = 20, by = 2),
  seq(from = 1, to = 20, by = 2)
)
## [1] 2 4 6 8 10 12 14 16 18 20 1 3 5 7 9 11 13 15 17 19
# Pick rows 1, 3, 5, 7 and 9 using a numeric index vector.
phishing[seq(from = 1, to = 9, by = 2), ]
## # A tibble: 5 x 50
## id NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3 1 5 72 0 0
## 2 3 3 1 2 58 0 0
## 3 5 3 0 4 46 0 0
## 4 7 2 0 5 60 0 0
## 5 9 8 7 2 76 1 1
## # ... with 43 more variables: AtSymbol <dbl>, TildeSymbol <dbl>,
## # NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## # NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## # RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## # DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## # PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## # NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## # PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## # RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## # PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## # FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## # SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## # ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## # PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## # ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## # CLASS_LABEL <dbl>
Pivoting
This section describes the use of the pivot_longer() and pivot_wider() functions, whose goal is to improve on the usability of gather() and spread().
- pivot_longer: increasing the number of rows and decreasing the number of columns.
library(tidyverse)
# Pivot the single NumDots column into a name column and a value column.
phishing %>%
  pivot_longer(
    cols = NumDots,
    values_to = "NumDots new",
    names_to = "NumDots old"
  )
## # A tibble: 10,000 x 51
## id SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname AtSymbol
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 5 72 0 0 0
## 2 2 1 3 144 0 0 0
## 3 3 1 2 58 0 0 0
## 4 4 1 6 79 1 0 0
## 5 5 0 4 46 0 0 0
## 6 6 1 1 42 1 0 0
## 7 7 0 5 60 0 0 0
## 8 8 0 3 30 0 0 0
## 9 9 7 2 76 1 1 0
## 10 10 0 2 46 0 0 0
## # ... with 9,990 more rows, and 44 more variables: TildeSymbol <dbl>,
## # NumUnderscore <dbl>, NumPercent <dbl>, NumQueryComponents <dbl>,
## # NumAmpersand <dbl>, NumHash <dbl>, NumNumericChars <dbl>, NoHttps <dbl>,
## # RandomString <dbl>, IpAddress <dbl>, DomainInSubdomains <dbl>,
## # DomainInPaths <dbl>, HttpsInHostname <dbl>, HostnameLength <dbl>,
## # PathLength <dbl>, QueryLength <dbl>, DoubleSlashInPath <dbl>,
## # NumSensitiveWords <dbl>, EmbeddedBrandName <dbl>, PctExtHyperlinks <dbl>,
## # PctExtResourceUrls <dbl>, ExtFavicon <dbl>, InsecureForms <dbl>,
## # RelativeFormAction <dbl>, ExtFormAction <dbl>, AbnormalFormAction <dbl>,
## # PctNullSelfRedirectHyperlinks <dbl>, FrequentDomainNameMismatch <dbl>,
## # FakeLinkInStatusBar <dbl>, RightClickDisabled <dbl>, PopUpWindow <dbl>,
## # SubmitInfoToEmail <dbl>, IframeOrFrame <dbl>, MissingTitle <dbl>,
## # ImagesOnlyInForm <dbl>, SubdomainLevelRT <dbl>, UrlLengthRT <dbl>,
## # PctExtResourceUrlsRT <dbl>, AbnormalExtFormActionR <dbl>,
## # ExtMetaScriptLinkRT <dbl>, PctExtNullSelfRedirectHyperlinksRT <dbl>,
## # CLASS_LABEL <dbl>, NumDots old <chr>, NumDots new <dbl>
- pivot_wider: Widens data, increasing the number of columns and decreasing the number of rows. The inverse transformation is pivot_longer().
# Attach the row count of each SubdomainLevelRT value to every row, then
# spread those counts into one column per SubdomainLevelRT value.
phishing %>%
  select(SubdomainLevelRT, UrlLengthRT) %>%
  group_by(SubdomainLevelRT) %>%
  mutate(SubdomainLevelRT_n = n()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = SubdomainLevelRT,
    names_prefix = "SubdomainLevelRT_",
    values_from = SubdomainLevelRT_n,
    # Several rows land in each cell; collapse them with mean().
    values_fn = list(SubdomainLevelRT_n = mean)
  )
## # A tibble: 3 x 4
## UrlLengthRT SubdomainLevelRT_1 `SubdomainLevelRT_-1` SubdomainLevelRT_0
## <dbl> <dbl> <dbl> <dbl>
## 1 0 9666 100 234
## 2 -1 9666 100 234
## 3 1 9666 100 234
Missing value
The phishing dataset has no missing values of its own, so four values are set to NA below to demonstrate how missing values can be handled.
Following are the 3 tidyr functions that are handy for processing Missing Values
- drop_na()-drops/removes the rows/entries with Missing Values
- fill()- fills the NAs (missing values) in selected columns
- replace_na()-used when you have got the replacement value which the NAs should be filled with.
library(dplyr)
# Work on a copy so the original phishing table stays untouched.
df <- phishing
# Blank out four individual cells to simulate missing data.
df[["RightClickDisabled"]][2] <- NA
df[["NumDash"]][5] <- NA
df[["NumDashInHostname"]][5] <- NA
df[["SubdomainLevel"]][10] <- NA
# Total number of NA cells across the whole table.
paste("Number of Missing Values", sum(is.na(df)), sep = " ")
## [1] "Number of Missing Values 4"
# Remove every row that contains at least one NA, then recount the NAs.
df_no_na <- df %>% drop_na()
paste("Number of Missing Values", sum(is.na(df_no_na)), sep = " ")
## [1] "Number of Missing Values 0"
# Rows left after dropping incomplete rows.
paste("Number of Rows", NROW(df_no_na), sep = " ")
## [1] "Number of Rows 9997"
# Column count of the table without incomplete rows.
paste("Number of Columns", NCOL(df_no_na), sep = " ")
## [1] "Number of Columns 50"
# Fill each NA with the previous non-missing value in the same column
# ("down" is fill()'s default direction).
df_na_filled <- fill(df, dplyr::everything(), .direction = "down")
# Count the NA cells that remain after filling.
paste("Number of Missing Values", sum(is.na(df_na_filled)), sep = " ")
## [1] "Number of Missing Values 0"
# Row count of the filled table.
paste("Number of Rows", NROW(df_na_filled), sep = " ")
## [1] "Number of Rows 10000"
# Column count of the filled table.
paste("Number of Columns", NCOL(df_na_filled), sep = " ")
## [1] "Number of Columns 50"
# Replace each NA in the numeric columns of df with 0.
# across() with where() is the current dplyr idiom; the scoped mutate_if()
# variant it replaces here is superseded.
df_na_replaced <- df %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))