Sasak Sasak -4 years ago 36
R Question

Using term frequency list from another dataframe under a specific threshold

My dataset is this:

# Example input (dput() output): a data frame with 27 rows and 10 factor
# columns (colname1, colname2, colname3, colname4, colname6, colname7,
# colname9, colname11, colname12, colname13). Cells hold "price<N>" terms;
# most columns also carry an empty-string level "" that fills the remaining
# rows (presumably the unused cells of a sparse table).
sparsedf <- structure(list(colname1 = structure(c(8L, 3L, 4L, 7L, 2L, 6L,
5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "price106", "price142", "price185",
"price655", "price67", "price753", "price99"), class = "factor"),
colname2 = structure(c(2L, 3L, 8L, 15L, 5L, 4L, 12L, 9L,
10L, 7L, 11L, 6L, 13L, 14L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "price100", "price143",
"price16", "price271", "price29", "price3", "price36", "price391",
"price433", "price505", "price56", "price578", "price655",
"price753"), class = "factor"), colname3 = structure(c(2L,
8L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 19L, 20L, 3L, 18L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"price101", "price106", "price186", "price228", "price272",
"price314", "price33", "price354", "price392", "price434",
"price469", "price506", "price541", "price579", "price615",
"price652", "price67", "price686", "price720"), class = "factor"),
colname4 = structure(c(2L, 3L, 8L, 5L, 9L, 6L, 18L, 7L, 13L,
10L, 19L, 12L, 14L, 16L, 11L, 15L, 4L, 17L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "price102", "price144",
"price20", "price229", "price315", "price393", "price4",
"price46", "price470", "price52", "price542", "price55",
"price580", "price6", "price616", "price655", "price7", "price753"
), class = "factor"), colname6 = structure(c(1L, 2L, 3L,
4L, 6L, 7L, 8L, 9L, 10L, 11L, 5L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L), .Label = c("price104",
"price146", "price188", "price231", "price25", "price274",
"price317", "price356", "price395", "price436", "price472",
"price544", "price582", "price618", "price654", "price687",
"price722", "price752", "price779", "price809", "price835",
"price857", "price881", "price904", "price926", "price947",
"price966"), class = "factor"), colname7 = structure(c(2L,
4L, 5L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"price105", "price106", "price147", "price189"), class = "factor"),
colname9 = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 11L, 8L, 9L,
10L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "price107", "price149", "price191",
"price233", "price276", "price319", "price397", "price438",
"price474", "price57"), class = "factor"), colname11 = structure(c(2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L,
19L, 17L, 10L, 18L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"price109", "price12", "price193", "price235", "price278",
"price321", "price359", "price399", "price40", "price440",
"price475", "price511", "price547", "price585", "price621",
"price689", "price754", "price78"), class = "factor"), colname12 = structure(c(2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 10L, 24L, 25L, 26L, 27L,
1L), .Label = c("", "price110", "price150", "price194", "price236",
"price279", "price322", "price360", "price400", "price42",
"price441", "price476", "price512", "price548", "price586",
"price622", "price656", "price690", "price725", "price755",
"price782", "price812", "price838", "price884", "price907",
"price929", "price950"), class = "factor"), colname13 = structure(c(3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 4L, 17L, 12L, 13L, 14L, 15L,
16L, 18L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"price106", "price11", "price12", "price13", "price15", "price237",
"price280", "price323", "price361", "price401", "price513",
"price549", "price587", "price623", "price657", "price67",
"price753"), class = "factor")), .Names = c("colname1", "colname2",
"colname3", "colname4", "colname6", "colname7", "colname9", "colname11",
"colname12", "colname13"), class = "data.frame", row.names = c(NA,
-27L))


I would like to use the following list (frequency_term_df), which contains the name of each term in the first column and the frequency of that term in the second column.
I would like to apply this list to the above dataframe and remove the terms whose frequency is equal to or lower than 2.

Is this possible?

# Term-frequency lookup table (dput() output): 150 rows, one per distinct
# term. `name` is the term (including the empty string "") and `Number` is
# how often that term occurs.
#
# The original dput() of this data.table ended with
# `.internal.selfref = <pointer: ...>`, which is not parseable R and made the
# snippet fail when pasted. That attribute is dropped here; the data is
# unchanged. If data.table by-reference operations are needed afterwards,
# call data.table::setDT(frequency_term_df) to restore the internal pointer.
frequency_term_df <- structure(list(name = c("price99", "price100", "price101", "price102",
"price104", "price105", "price107", "price109", "price110", "price11",
"price142", "price143", "price33", "price144", "price146", "price147",
"price149", "price12", "price150", "price13", "price185", "price36",
"price186", "price4", "price188", "price189", "price191", "price193",
"price194", "price15", "price753", "price228", "price229", "price231",
"price106", "price233", "price235", "price236", "price237", "price271",
"price272", "price46", "price274", "", "price276", "price278",
"price279", "price280", "price67", "price16", "price314", "price315",
"price317", "price319", "price321", "price322", "price323", "price655",
"price56", "price354", "price7", "price356", "price57", "price359",
"price360", "price361", "price391", "price392", "price393", "price395",
"price397", "price399", "price400", "price401", "price433", "price434",
"price55", "price436", "price438", "price440", "price441", "price3",
"price469", "price470", "price472", "price474", "price475", "price476",
"price505", "price506", "price25", "price511", "price512", "price513",
"price29", "price541", "price542", "price544", "price547", "price548",
"price549", "price578", "price579", "price580", "price582", "price585",
"price586", "price587", "price615", "price616", "price618", "price621",
"price622", "price623", "price652", "price52", "price654", "price78",
"price656", "price657", "price686", "price6", "price687", "price689",
"price690", "price720", "price20", "price722", "price40", "price725",
"price752", "price754", "price755", "price779", "price782", "price809",
"price812", "price835", "price838", "price857", "price42", "price881",
"price884", "price904", "price907", "price926", "price929", "price947",
"price950", "price966"), Number = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 110L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
)), row.names = c(NA, -150L), class = c("data.table", "data.frame"
), .Names = c("name", "Number"))

Answer Source

I'm assuming that by "remove" you mean turn the values into blank strings. First, identify the names which have a frequency <= 2.

# Names of every term whose recorded frequency is at most 2.
freq2 <- with(frequency_term_df, name[Number <= 2])

Then use ifelse together with sapply to go through each column and replace the names you don't want with blanks.

# Blank out the low-frequency terms column by column.
#
# lapply() combined with `[]<-` keeps the shape, column names and row names
# of `sparsedf` for any number of rows. The previous
# as.data.frame(sapply(...)) relied on sapply() simplifying to a matrix,
# which breaks for a one-row input (sapply() then returns a plain vector and
# the result becomes a single-column data frame). Columns are always
# returned as character, independent of the session's stringsAsFactors
# default.
sparsedf2 <- sparsedf
sparsedf2[] <- lapply(sparsedf, function(x) {
  x <- as.character(x)
  x[x %in% freq2] <- ""
  x
})
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download