fakir examples

library(fakir)
library(dplyr)
library(ggplot2)
library(sf)

Fake client database

This fake database mimics the after-sales client database of a phone company. It provides:

  • a client database with all characteristics of the clients.

  • a ticket database which contains all the calls made to the after-sales service by clients who had problems

  • Ticket-centered dataset with the client characteristics already joined

# Generate 10 fake tickets; each row also carries the columns of its client
fake_ticket_client(vol = 10)
#> old-style crs object detected; please recreate object with a recent sf::st_crs()
#> # A tibble: 10 × 25
#>    ref           num_client first    last  job     age region id_dpt departement
#>    <chr>         <chr>      <chr>    <chr> <chr> <dbl> <chr>  <chr>  <chr>      
#>  1 DOSS-AMQN-002 79         Jovan    O'Ke… Gene…    22 Île-d… 75     Paris      
#>  2 DOSS-NCKJ-010 69         Miss     Lean… Emer…    68 Bourg… 89     Yonne      
#>  3 DOSS-GPBE-009 120        Odell    Stok… Engi…    24 Rhône… 42     Loire      
#>  4 DOSS-GRLN-001 31         Loren    Lars… <NA>     NA Langu… 66     Pyrénées-O…
#>  5 DOSS-LEPJ-004 59         Maybelle Maye… Furt…    18 Centre 45     Loiret     
#>  6 DOSS-DUCL-005 118        Jamarion Ober… Engi…    18 Picar… 02     Aisne      
#>  7 DOSS-OCED-003 77         Lee      Scha… Admi…    NA Aquit… 24     Dordogne   
#>  8 DOSS-KXSJ-007 65         Demetric Auer  Cont…    21 <NA>   54     Meurthe-et…
#>  9 DOSS-UITD-006 141        Wilfrid  Harv… Educ…    53 <NA>   42     Loire      
#> 10 DOSS-SHKL-008 182        Addyson  Nien… Earl…    65 <NA>   95     Val-d'Oise 
#> # ℹ 16 more variables: cb_provider <chr>, name <chr>, entry_date <dttm>,
#> #   fidelity_points <dbl>, priority_encoded <dbl>, priority <fct>,
#> #   timestamp <date>, year <dbl>, month <dbl>, day <int>, supported <chr>,
#> #   supported_encoded <int>, type <chr>, type_encoded <int>, state <fct>,
#> #   source_call <fct>
  • Separate tickets and client databases
# With split = TRUE the result is a list of two tibbles, `clients` and
# `tickets`, instead of a single joined table
tickets_db <- fake_ticket_client(vol = 100, split = TRUE)
#> old-style crs object detected; please recreate object with a recent sf::st_crs()
tickets_db
#> $clients
#> # A tibble: 200 × 14
#>    num_client first   last     job     age region id_dpt departement cb_provider
#>  * <chr>      <chr>   <chr>    <chr> <dbl> <chr>  <chr>  <chr>       <chr>      
#>  1 1          Solomon Heaney   Civi…    53 Limou… 19     Corrèze     Diners Clu…
#>  2 2          Karma   William… Scie…    81 Pays … 53     Mayenne     VISA 13 di…
#>  3 3          Press   Kulas    Anim…    NA Alsace 67     Bas-Rhin    <NA>       
#>  4 4          Laken   McDermo… <NA>     NA Île-d… 77     Seine-et-M… <NA>       
#>  5 5          Sydnie  Jaskols… Hort…    30 <NA>   29     Finistère   <NA>       
#>  6 6          Clayton Runolfs… Comm…    NA Auver… 63     Puy-de-Dôme Diners Clu…
#>  7 7          Roberta Purdy-W… Fina…    60 Limou… 87     Haute-Vien… <NA>       
#>  8 8          Dr.     RonaldM… Astr…    30 Breta… 29     Finistère   <NA>       
#>  9 9          Miss    Alondra… Occu…    18 Langu… 11     Aude        Diners Clu…
#> 10 10         Vernice Ondrick… Clin…    19 Breta… 56     <NA>        <NA>       
#> # ℹ 190 more rows
#> # ℹ 5 more variables: name <chr>, entry_date <dttm>, fidelity_points <dbl>,
#> #   priority_encoded <dbl>, priority <fct>
#> 
#> $tickets
#> # A tibble: 100 × 10
#>    ref            num_client  year month   day timestamp  supported type   state
#>    <chr>          <chr>      <dbl> <dbl> <int> <date>     <chr>     <chr>  <fct>
#>  1 DOSS-GFEL-0028 1           2018     7    31 2018-07-31 No        Box    Runn…
#>  2 DOSS-UWYV-0016 22          2022     5    23 2022-05-23 No        Box    Wait…
#>  3 DOSS-DKFC-0073 9           2022     6    27 2022-06-27 No        Box    Runn…
#>  4 DOSS-SAYJ-0047 8           2022     7    11 2022-07-11 No        Phone  Over 
#>  5 DOSS-GSMZ-0080 30          2022     7    28 2022-07-28 Yes       Box    Over 
#>  6 DOSS-UIOZ-0085 10          2022     8     9 2022-08-09 Yes       Setti… tech…
#>  7 DOSS-DSMI-0065 37          2022     9     7 2022-09-07 No        Box    tech…
#>  8 DOSS-JOYV-0029 37          2022    10    28 2022-10-28 No        Box    Over 
#>  9 DOSS-WPSG-0013 24          2022    11     4 2022-11-04 No        Setti… Over 
#> 10 DOSS-NHFG-0036 12          2022    11    21 2022-11-21 No        Setti… Over 
#> # ℹ 90 more rows
#> # ℹ 1 more variable: source_call <fct>
  • Explore datasets
# Fidelity points against entry date, with a smoothed trend over the points
ggplot(
  data = tickets_db$clients,
  mapping = aes(x = entry_date, y = fidelity_points)
) +
  geom_point() +
  geom_smooth()
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Number of tickets for each ticket type
ggplot(data = tickets_db$tickets, mapping = aes(x = type)) +
  geom_bar()

# Number of tickets for each ticket state
ggplot(data = tickets_db$tickets, mapping = aes(x = state)) +
  geom_bar()

  • Join with the internal {sf} spatial dataset fra_sf. The {sf} package must be loaded.
# Summarise the clients per department id, then attach the result to the
# spatial dataset fra_sf
clients_map <- tickets_db$clients %>%
  group_by(id_dpt) %>%
  summarise(
    number_of_clients = n(),
    # clients with missing fidelity_points are left out of the average
    average_fidelity = mean(fidelity_points, na.rm = TRUE)
  ) %>%
  # a full join keeps the rows present on only one side of the join
  full_join(fra_sf, by = "id_dpt") %>%
  # convert the joined table back to an sf object
  st_sf()
#> old-style crs object detected; please recreate object with a recent sf::st_crs()

# Map filled by the average fidelity points computed in clients_map
ggplot(clients_map) +
  geom_sf(aes(fill = average_fidelity)) +
  scale_fill_viridis_c() +
  # EPSG codes: 2154 is RGF93 / Lambert-93, 4326 is WGS 84; in coord_sf(),
  # `crs` is the projection used for drawing and `datum` drives the graticule
  coord_sf(
    crs = 2154,
    datum = 4326
  )

Fake products

  • Create a fake dataset of connected wearables
# Number of fake products in each category
fake_products(10) %>%
  count(category)
#> # A tibble: 7 × 2
#>   category             n
#>   <chr>            <int>
#> 1 Awesome              2
#> 2 Fitness              1
#> 3 Gaming               1
#> 4 Industrial           1
#> 5 Lifestyle            1
#> 6 Medical              3
#> 7 Pets and Animals     1

Fake website visits

# One row per day from `from` to `to` (31 days here), with one integer column
# per page: home, about, blog and contact
fake_visits(
  from = "2017-01-01",
  to = "2017-01-31"
)
#> # A tibble: 31 × 8
#>    timestamp   year month   day  home about  blog contact
#>  * <date>     <dbl> <dbl> <int> <int> <int> <int>   <int>
#>  1 2017-01-01  2017     1     1   369   220   404     210
#>  2 2017-01-02  2017     1     2   159   250   414     490
#>  3 2017-01-03  2017     1     3   436   170   498     456
#>  4 2017-01-04  2017     1     4    NA   258   526     392
#>  5 2017-01-05  2017     1     5   362    NA   407     291
#>  6 2017-01-06  2017     1     6   245   145   576      90
#>  7 2017-01-07  2017     1     7    NA    NA   484     167
#>  8 2017-01-08  2017     1     8   461   103   441      NA
#>  9 2017-01-09  2017     1     9   337   113   673     379
#> 10 2017-01-10  2017     1    10    NA   169   308     139
#> # ℹ 21 more rows

Fake questionnaire on means of transport / goal of the trip

  • All answers
# Joined output: three rows per respondent, so 30 rows for n = 10
fake_survey_answers(n = 10)
#> old-style crs object detected; please recreate object with a recent sf::st_crs()
#> # A tibble: 30 × 12
#>    id_individu   age sexe  region           id_departement nom_departement
#>    <chr>       <int> <chr> <chr>            <chr>          <chr>          
#>  1 ID-NYDZ-010    NA <NA>  Haute-Normandie  27             Eure           
#>  2 ID-NYDZ-010    NA <NA>  Haute-Normandie  27             Eure           
#>  3 ID-NYDZ-010    NA <NA>  Haute-Normandie  27             Eure           
#>  4 ID-PWLB-009    71 F     Bretagne         35             Ille-et-Vilaine
#>  5 ID-PWLB-009    71 F     Bretagne         35             Ille-et-Vilaine
#>  6 ID-PWLB-009    71 F     Bretagne         35             Ille-et-Vilaine
#>  7 ID-NMQG-001    42 M     Poitou-Charentes 16             Charente       
#>  8 ID-NMQG-001    42 M     Poitou-Charentes 16             Charente       
#>  9 ID-NMQG-001    42 M     Poitou-Charentes 16             Charente       
#> 10 ID-RJXN-002    71 O     Île-de-France    75             Paris          
#> # ℹ 20 more rows
#> # ℹ 6 more variables: question_date <dttm>, year <dbl>, type <chr>,
#> #   distance_km <dbl>, transport <fct>, temps_trajet_en_heures <dbl>
  • Separate individuals and their answers
# With split = TRUE the result is a list of two tibbles: `individus` (one row
# per respondent) and `answers` (the answers, keyed by id_individu)
fake_survey_answers(n = 10, split = TRUE)
#> old-style crs object detected; please recreate object with a recent sf::st_crs()
#> $individus
#> # A tibble: 10 × 8
#>    id_individu   age sexe  region            id_departement nom_departement     
#>    <chr>       <int> <chr> <chr>             <chr>          <chr>               
#>  1 ID-NYDZ-010    NA <NA>  <NA>              23             Creuse              
#>  2 ID-PWLB-009    71 F     Île-de-France     93             Seine-Saint-Denis   
#>  3 ID-NMQG-001    42 M     Limousin          23             <NA>                
#>  4 ID-RJXN-002    71 O     Corse             2B             Haute-Corse         
#>  5 ID-MROK-007    41 M     Rhône-Alpes       26             Drôme               
#>  6 ID-VMKS-004    33 O     Pays de la Loire  85             Vendée              
#>  7 ID-XEMZ-003    81 O     Aquitaine         64             Pyrénées-Atlantiques
#>  8 ID-EUDQ-005    44 M     Champagne-Ardenne 52             Haute-Marne         
#>  9 ID-DCIZ-008    92 O     Alsace            68             <NA>                
#> 10 ID-KPUS-006    57 O     Rhône-Alpes       74             Haute-Savoie        
#> # ℹ 2 more variables: question_date <dttm>, year <dbl>
#> 
#> $answers
#> # A tibble: 30 × 5
#>    id_individu type      distance_km transport temps_trajet_en_heures
#>    <chr>       <chr>           <dbl> <fct>                      <dbl>
#>  1 ID-NYDZ-010 travail         12.2  voiture                     0.15
#>  2 ID-NYDZ-010 commerces        9.61 bus                         1.01
#>  3 ID-NYDZ-010 loisirs        549.   avion                       0.27
#>  4 ID-PWLB-009 travail         11.9  voiture                     0.14
#>  5 ID-PWLB-009 commerces       27.4  voiture                     0.34
#>  6 ID-PWLB-009 loisirs        210.   train                       0.42
#>  7 ID-NMQG-001 travail          2.38 velo                        0.43
#>  8 ID-NMQG-001 commerces       14.9  voiture                     0.18
#>  9 ID-NMQG-001 loisirs        446.   train                       0.89
#> 10 ID-RJXN-002 travail          6.18 mobylette                   0.75
#> # ℹ 20 more rows

Fake transport use

# Keep the joined answers of 30 respondents (90 rows) for plotting
answers <- fake_survey_answers(n = 30)
#> old-style crs object detected; please recreate object with a recent sf::st_crs()
answers
#> # A tibble: 90 × 12
#>    id_individu   age sexe  region           id_departement nom_departement
#>    <chr>       <int> <chr> <chr>            <chr>          <chr>          
#>  1 ID-MROK-007    NA M     Rhône-Alpes      42             Loire          
#>  2 ID-MROK-007    NA M     Rhône-Alpes      42             Loire          
#>  3 ID-MROK-007    NA M     Rhône-Alpes      42             Loire          
#>  4 ID-NYDZ-010    49 M     Basse-Normandie  14             Calvados       
#>  5 ID-NYDZ-010    49 M     Basse-Normandie  14             Calvados       
#>  6 ID-NYDZ-010    49 M     Basse-Normandie  14             Calvados       
#>  7 ID-HXOG-015    50 M     <NA>             69             Rhône          
#>  8 ID-HXOG-015    50 M     <NA>             69             Rhône          
#>  9 ID-HXOG-015    50 M     <NA>             69             Rhône          
#> 10 ID-MZNB-024    70 F     Pays de la Loire 72             Sarthe         
#> # ℹ 80 more rows
#> # ℹ 6 more variables: question_date <dttm>, year <dbl>, type <chr>,
#> #   distance_km <dbl>, transport <fct>, temps_trajet_en_heures <dbl>

# Log of the distance against age, with one colour and one panel per `type`;
# each panel gets its own y scale
ggplot(
  data = answers,
  mapping = aes(x = age, y = log(distance_km), colour = type)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~type, scales = "free_y")
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#> Warning: Removed 6 rows containing non-finite outside the scale range
#> (`stat_smooth()`).
#> Warning: Removed 6 rows containing missing values or values outside the scale range
#> (`geom_point()`).