diff --git a/R/ChineseNames.R b/R/ChineseNames.R index 9811322..3f580aa 100644 --- a/R/ChineseNames.R +++ b/R/ChineseNames.R @@ -42,8 +42,7 @@ NULL #' #' @name familyname #' @usage data(familyname) -#' @format -#' A data frame with 7 variables: +#' @format A data frame with 7 variables: #' \describe{ #' \item{\code{surname}}{surname (in Chinese)} #' \item{\code{compound}}{0 = single surname, 1 = compound surname} @@ -61,8 +60,7 @@ NULL #' #' @name givenname #' @usage data(givenname) -#' @format -#' A data frame with 25 variables: +#' @format A data frame with 25 variables: #' \describe{ #' \item{\code{character}}{character used in given names (in Chinese)} #' \item{\code{pinyin}}{pinyin (pronunciation)} @@ -70,15 +68,15 @@ NULL #' \item{\code{n.male}}{total counts in male} #' \item{\code{n.female}}{total counts in female} #' \item{\code{name.gender}}{difference in proportions of a character used by male vs. female} -#' \item{\code{n.1930_1959}, \code{n.1960_1969}, ..., \code{n.2000_2008}}{total counts in a birth cohort} -#' \item{\code{ppm.1930_1959}, \code{ppm.1960_1969}, ..., \code{ppm.2000_2008}}{proportion (parts per million) in a birth cohort} +#' \item{\code{n.1930_1959}, \code{n.1960_1969}, \code{n.1970_1979}, \code{n.1980_1989}, \code{n.1990_1999}, \code{n.2000_2008}}{total counts in a birth cohort} +#' \item{\code{ppm.1930_1959}, \code{ppm.1960_1969}, \code{ppm.1970_1979}, \code{ppm.1980_1989}, \code{ppm.1990_1999}, \code{ppm.2000_2008}}{proportion (parts per million) in a birth cohort} #' \item{\code{name.ppm}}{average ppm (parts per million) across all cohorts} #' \item{\code{name.uniqueness}}{name-character uniqueness (in naming practices)} #' \item{\code{corpus.ppm}}{proportion (parts per million) in contemporary Chinese corpus} #' \item{\code{corpus.uniqueness}}{character-corpus uniqueness (in contemporary Chinese corpus)} -#' \item{\code{name.valence} (based on subjective ratings from 16 raters, ICC = 0.921)}{name valence (positivity of character 
meaning)} -#' \item{\code{name.warmth} (based on subjective ratings from 10 raters, ICC = 0.774)}{name warmth/morality} -#' \item{\code{name.competence} (based on subjective ratings from 10 raters, ICC = 0.712)}{name competence/assertiveness} +#' \item{\code{name.valence}}{name valence (positivity of character meaning) (based on subjective ratings from 16 raters, ICC = 0.921)} +#' \item{\code{name.warmth}}{name warmth/morality (based on subjective ratings from 10 raters, ICC = 0.774)} +#' \item{\code{name.competence}}{name competence/assertiveness (based on subjective ratings from 10 raters, ICC = 0.712)} #' } #' @details \url{https://github.com/psychbruce/ChineseNames} NULL @@ -112,7 +110,7 @@ NULL NULL -#### Function #### +#### Functions #### `%>%`=dplyr::`%>%` @@ -164,21 +162,7 @@ NULL #' @return #' A new data frame (\code{data.table}) with name indices appended. #' -#' @examples -#' ## compute for one name -#' myname=demodata[1, "name"] -#' mybirth=1995 -#' d=compute_name_index(name=myname, birth=mybirth, index="NU") -#' # use View(d) to see the results -#' -#' ## compute for a dataset with a variable of names -#' data(demodata) # a data frame -#' data=compute_name_index(demodata, -#' var.fullname="name") # not adjusted for birth year -#' data=compute_name_index(demodata, -#' var.fullname="name", -#' var.birthyear="birth") # adjusted for birth year -#' # use View(data) to see the results +#' @note For details and examples, see \url{https://github.com/psychbruce/ChineseNames} #' #' @import data.table #' @importFrom bruceR Print MEAN LOOKUP @@ -197,6 +181,89 @@ compute_name_index=function(data=NULL, digits=4, return.namechar=TRUE, return.all=FALSE) { + ## Prepare ## + + familyname=ChineseNames::familyname + givenname=ChineseNames::givenname + + fuxing=familyname[familyname$compound==1, "surname"] + ref0=givenname$name.ppm; names(ref0)=givenname$character + ref1=givenname$ppm.1930_1959; names(ref1)=givenname$character + ref2=givenname$ppm.1960_1969; 
names(ref2)=givenname$character + ref3=givenname$ppm.1970_1979; names(ref3)=givenname$character + ref4=givenname$ppm.1980_1989; names(ref4)=givenname$character + ref5=givenname$ppm.1990_1999; names(ref5)=givenname$character + ref6=givenname$ppm.2000_2008; names(ref6)=givenname$character + + compute_NU_char=function(char, year=NA, approx=TRUE) { + raw=!approx + if(is.na(char)) + ppm="NA" + else if(is.na(year)) + ppm=ref0[char] # overall + else if(year<1930) + ppm=ref1[char] # 1930-1959 + else if(year<1960) + ppm=ifelse( + raw | year<1955, + ref1[char], # 1930-1959 + (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10 + ) + else if(year<1970) + ppm=ifelse( + raw, + ref2[char], # 1960-1969 + ifelse(year<1965, + (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10, + (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10) + ) + else if(year<1980) + ppm=ifelse( + raw, + ref3[char], # 1970-1979 + ifelse(year<1975, + (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10, + (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10) + ) + else if(year<1990) + ppm=ifelse( + raw, + ref4[char], # 1980-1989 + ifelse(year<1985, + (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10, + (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10) + ) + else if(year<2000) + ppm=ifelse( + raw, + ref5[char], # 1990-1999 + ifelse(year<1995, + (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10, + (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10) + ) + else if(year<2010) + ppm=ifelse( + raw, + ref6[char], # 2000-2009 (2008) + ifelse(year<2005, + (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10, + ref6[char]) + ) + else + ppm="NA" + if(is.na(ppm)) ppm=0 + if(ppm=="NA") ppm=NA + return(as.numeric( -log10((ppm+1)/10^6) )) + } + + ## Debug ## + + `.`=NULL + NLen=SNU=SNI=NU=CCU=NG=NV=NW=NC=NULL + fx=sur.name=given.name=name0=name1=name2=name3=NULL + + ## Main ## + if(is.na(name)==FALSE) { data=data.frame(name=name, birth=birth) var.fullname="name" @@ -325,75 +392,3 @@ 
compute_name_index=function(data=NULL, return(data.new) } - -compute_NU_char=function(char, year=NA, approx=TRUE) { - raw=!approx - if(is.na(char)) - ppm="NA" - else if(is.na(year)) - ppm=ref0[char] # overall - else if(year<1930) - ppm=ref1[char] # 1930-1959 - else if(year<1960) - ppm=ifelse( - raw | year<1955, - ref1[char], # 1930-1959 - (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10 - ) - else if(year<1970) - ppm=ifelse( - raw, - ref2[char], # 1960-1969 - ifelse(year<1965, - (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10, - (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10) - ) - else if(year<1980) - ppm=ifelse( - raw, - ref3[char], # 1970-1979 - ifelse(year<1975, - (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10, - (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10) - ) - else if(year<1990) - ppm=ifelse( - raw, - ref4[char], # 1980-1989 - ifelse(year<1985, - (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10, - (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10) - ) - else if(year<2000) - ppm=ifelse( - raw, - ref5[char], # 1990-1999 - ifelse(year<1995, - (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10, - (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10) - ) - else if(year<2010) - ppm=ifelse( - raw, - ref6[char], # 2000-2009 (2008) - ifelse(year<2005, - (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10, - (ref6[char]*(2015-year) + ref7[char]*(year-2005))/10) - ) - else if(year<2020) - ppm=ifelse( - raw, - ref7[char], # 2010-2019 (forecast by time-series models) - ifelse(year<2015, - (ref6[char]*(2015-year) + ref7[char]*(year-2005))/10, - (ref7[char]*(2025-year) + ref8[char]*(year-2015))/10) - ) - else if(year<2030) - ppm=ref8[char] # 2020-2029 (forecast by time-series models) - else - ppm="NA" - if(is.na(ppm)) ppm=0 - if(ppm=="NA") ppm=NA - return(as.numeric( -log10((ppm+1)/10^6) )) -} - diff --git a/README.md b/README.md index 4363b42..fb88ad3 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This name database contains 
nationwide frequency statistics for 1,806 Chinese su This package also contains a function that can compute indices of Chinese surnames and given names for scientific research (e.g., name uniqueness, name gender, name positivity, name warmth/competence). -[![CRAN-Version](https://www.r-pkg.org/badges/version/ChineseNames?color=red)](https://CRAN.R-project.org/package=ChineseNames) [![CRAN-Downloads](https://cranlogs.r-pkg.org/badges/grand-total/ChineseNames)](https://www.rdocumentation.org/packages/ChineseNames) [![GitHub-Version](https://img.shields.io/github/r-package/v/psychbruce/ChineseNames?label=GitHub)](https://github.com/psychbruce/ChineseNames) [![Travis-Build-Status](https://travis-ci.com/psychbruce/ChineseNames.svg?branch=master)](https://travis-ci.com/psychbruce/ChineseNames) [![GitHub-Stars](https://img.shields.io/github/stars/psychbruce/ChineseNames?style=social)](https://github.com/psychbruce/ChineseNames/stargazers) +[![CRAN-Version](https://www.r-pkg.org/badges/version/ChineseNames?color=red)](https://CRAN.R-project.org/package=ChineseNames) [![CRAN-Downloads](https://cranlogs.r-pkg.org/badges/grand-total/ChineseNames)](https://www.rdocumentation.org/packages/ChineseNames) [![GitHub-Version](https://img.shields.io/github/r-package/v/psychbruce/ChineseNames?label=GitHub&color=orange)](https://github.com/psychbruce/ChineseNames) [![Travis-Build-Status](https://travis-ci.com/psychbruce/ChineseNames.svg?branch=master)](https://travis-ci.com/psychbruce/ChineseNames) [![GitHub-Stars](https://img.shields.io/github/stars/psychbruce/ChineseNames?style=social)](https://github.com/psychbruce/ChineseNames/stargazers) @@ -66,9 +66,9 @@ This package includes five datasets (`data.frame` in R): *Note*. The "ppm" in variable names of these datasets means "parts per million (百万分率)" (e.g., ppm = 1 means a proportion of 1/10^6). 
-### Compute Name Variables +### Compute Name Indices -**Use the `compute_name_index()` function.** This function computes multiple indices of Chinese surnames and given names for scientific research. Just input a data frame with full names (and birth year, if necessary), then it returns a new data frame with all name variables appended. +**Use the `compute_name_index()` function.** This function computes multiple indices of Chinese surnames and given names for scientific research. Just input a data frame with full names (and birth year, if necessary), then it returns a new data frame with all name indices appended. Example: @@ -76,8 +76,11 @@ Example: library(ChineseNames) ?compute_name_index # see usage in help page -demodata # a data frame with full names and birth years -View(demodata) # two variables: "name", "birth" +compute_name_index(name="包寒吴霜", birth=1995, index="NU") + +demodata=data.frame( + name=c("包寒吴霜", "陈俊霖", "张伟", "张炜", "欧阳修", "欧阳", "易烊千玺", "张艺谋", "王的"), + birth=c(1995, 1995, 1985, 1988, 1968, 2010, 2000, 1950, 2005)) newdata=compute_name_index( demodata, diff --git a/data/demodata.rda b/data/demodata.rda deleted file mode 100644 index be38a97..0000000 Binary files a/data/demodata.rda and /dev/null differ diff --git a/data/fuxing.rda b/data/fuxing.rda deleted file mode 100644 index 1a09fcc..0000000 Binary files a/data/fuxing.rda and /dev/null differ diff --git a/data/ref0.rda b/data/ref0.rda deleted file mode 100644 index cbd647f..0000000 Binary files a/data/ref0.rda and /dev/null differ diff --git a/data/ref1.rda b/data/ref1.rda deleted file mode 100644 index 281b6a9..0000000 Binary files a/data/ref1.rda and /dev/null differ diff --git a/data/ref2.rda b/data/ref2.rda deleted file mode 100644 index 46aeff1..0000000 Binary files a/data/ref2.rda and /dev/null differ diff --git a/data/ref3.rda b/data/ref3.rda deleted file mode 100644 index 0891e7f..0000000 Binary files a/data/ref3.rda and /dev/null differ diff --git a/data/ref4.rda b/data/ref4.rda 
deleted file mode 100644 index 0f8500d..0000000 Binary files a/data/ref4.rda and /dev/null differ diff --git a/data/ref5.rda b/data/ref5.rda deleted file mode 100644 index 2f6dfb2..0000000 Binary files a/data/ref5.rda and /dev/null differ diff --git a/data/ref6.rda b/data/ref6.rda deleted file mode 100644 index 44018a7..0000000 Binary files a/data/ref6.rda and /dev/null differ diff --git a/data/ref7.rda b/data/ref7.rda deleted file mode 100644 index c66213d..0000000 Binary files a/data/ref7.rda and /dev/null differ diff --git a/data/ref8.rda b/data/ref8.rda deleted file mode 100644 index df69d78..0000000 Binary files a/data/ref8.rda and /dev/null differ diff --git a/man/compute_name_index.Rd b/man/compute_name_index.Rd index 48ffe34..0247270 100644 --- a/man/compute_name_index.Rd +++ b/man/compute_name_index.Rd @@ -76,20 +76,6 @@ You can either input \code{data} with a variable of Chinese names or just input a vector of \code{name} (and \code{birth} year, if necessary). } -\examples{ -## compute for one name -myname=demodata[1, "name"] -mybirth=1995 -d=compute_name_index(name=myname, birth=mybirth, index="NU") -# use View(d) to see the results - -## compute for a dataset with a variable of names -data(demodata) # a data frame -data=compute_name_index(demodata, - var.fullname="name") # not adjusted for birth year -data=compute_name_index(demodata, - var.fullname="name", - var.birthyear="birth") # adjusted for birth year -# use View(data) to see the results - +\note{ +For details and examples, see \url{https://github.com/psychbruce/ChineseNames} } diff --git a/man/givenname.Rd b/man/givenname.Rd index 514c2d3..9d85a49 100644 --- a/man/givenname.Rd +++ b/man/givenname.Rd @@ -12,15 +12,15 @@ A data frame with 25 variables: \item{\code{n.male}}{total counts in male} \item{\code{n.female}}{total counts in female} \item{\code{name.gender}}{difference in proportions of a character used by male vs. 
female} - \item{\code{n.1930_1959}, \code{n.1960_1969}, ..., \code{n.2000_2008}}{total counts in a birth cohort} - \item{\code{ppm.1930_1959}, \code{ppm.1960_1969}, ..., \code{ppm.2000_2008}}{proportion (parts per million) in a birth cohort} + \item{\code{n.1930_1959}, \code{n.1960_1969}, \code{n.1970_1979}, \code{n.1980_1989}, \code{n.1990_1999}, \code{n.2000_2008}}{total counts in a birth cohort} + \item{\code{ppm.1930_1959}, \code{ppm.1960_1969}, \code{ppm.1970_1979}, \code{ppm.1980_1989}, \code{ppm.1990_1999}, \code{ppm.2000_2008}}{proportion (parts per million) in a birth cohort} \item{\code{name.ppm}}{average ppm (parts per million) across all cohorts} \item{\code{name.uniqueness}}{name-character uniqueness (in naming practices)} \item{\code{corpus.ppm}}{proportion (parts per million) in contemporary Chinese corpus} \item{\code{corpus.uniqueness}}{character-corpus uniqueness (in contemporary Chinese corpus)} - \item{\code{name.valence} (based on subjective ratings from 16 raters, ICC = 0.921)}{name valence (positivity of character meaning)} - \item{\code{name.warmth} (based on subjective ratings from 10 raters, ICC = 0.774)}{name warmth/morality} - \item{\code{name.competence} (based on subjective ratings from 10 raters, ICC = 0.712)}{name competence/assertiveness} + \item{\code{name.valence}}{name valence (positivity of character meaning) (based on subjective ratings from 16 raters, ICC = 0.921)} + \item{\code{name.warmth}}{name warmth/morality (based on subjective ratings from 10 raters, ICC = 0.774)} + \item{\code{name.competence}}{name competence/assertiveness (based on subjective ratings from 10 raters, ICC = 0.712)} } } \usage{