@@ -854,3 +854,79 @@ infer_agenda_from_transcript <- function(
854
854
855
855
agenda
856
856
}
857
+
858
#' Extract entities from a text
#'
#' Builds a prompt asking an LLM to extract entities (e.g. people, acronyms,
#' organizations and concepts) from one or more text documents and, unless
#' `prompt_only` is `TRUE`, sends it to the LLM and returns the entities found.
#' Can be useful to build vocabularies for the LLMs starting from an event
#' description or a transcript.
#'
#' @param text The text from which to extract the entities. If it has more
#'   than one element, the elements are joined into a single string using a
#'   dashed separator before being inserted in the prompt.
#' @param entities A non-empty character vector with the entity types to
#'   extract. Can be "people", "acronyms", "organizations", and "concepts".
#'   Default is all of them. When "acronyms" and/or "concepts" are requested,
#'   the prompt also asks the LLM to format them as `entity: definition`.
#' @param prompt_only If TRUE, only the prompt is returned, the LLM is not
#'   interrogated. Default is FALSE.
#' @param ... Additional arguments passed to the `interrogate_llm` function.
#'
#' @return If `prompt_only` is TRUE, a single string with the prompt.
#'   Otherwise, the parsed JSON answer flattened into an unnamed vector with
#'   the entities found in the text; the entity type keys are dropped.
#'
#' @export
#'
entity_extractor <- function(
    text,
    entities = c("people", "acronyms", "organizations", "concepts"),
    prompt_only = FALSE,
    ...
) {

  # Fail early: an empty or non-character `entities` would otherwise produce a
  # prompt with an empty list of entity types.
  if (!is.character(entities) || length(entities) == 0L || anyNA(entities)) {
    stop(
      "`entities` must be a non-empty character vector without NAs.",
      call. = FALSE
    )
  }

  # Avoid listing the same entity type twice in the prompt.
  entities <- unique(entities)

  # Multiple documents are merged into one string, divided by a dashed line.
  text <- paste(text, collapse = "--------\n\n\n")

  # Entity types for which a definition should be inferred from the context.
  acro_or_concepts <- entities[entities %in% c("acronyms", "concepts")]

  entity_list <- paste(sprintf("-`%s`;", entities), collapse = "\n")

  # Extra formatting instructions, only needed for acronyms and concepts. An
  # explicit empty string keeps the prompt assembly independent from how
  # `paste0()` treats NULL arguments.
  definition_note <- if (length(acro_or_concepts) > 0L) {
    paste0(
      "If you find ", paste(acro_or_concepts, collapse = " or "),
      ", they should be returned as a list of strings, with each element ",
      "formatted as 'entity: definition', trying to infer the definition ",
      "from the context. ",
      "If you are not 100% sure, or it's self explanatory, just list the ",
      "concepts as strings.\n\n"
    )
  } else {
    ""
  }

  # Built line by line so that the indentation of this source file cannot leak
  # into the prompt.
  example_json <- paste(
    '```json',
    '{',
    '  "people": ["John Doe", "Jane Smith"],',
    '  "organizations": ["Acme Corp"],',
    '  "acronyms": [',
    '    "LLM: Large Language Model",',
    '    "NLP: Natural Language Processing"',
    '  ],',
    '  "concepts": [',
    '    "Arxiv: Open access repository of scientific articles",',
    '    "Escherichia coli"',
    '  ]',
    '}',
    '```',
    sep = "\n"
  )

  task <- paste0(
    "You will be passed one or more text documents. For each document, you ",
    "should extract the following entities from the text:\n\n",
    entity_list,
    "\n\nYou should return a JSON object of the entities found in the text, ",
    "with each entity type as a key and a list of the entities of that type ",
    "as the value. For example, if you find two people and one organization ",
    "in the text, you should return a list with two keys, 'people' and ",
    "'organizations', and the corresponding lists of entities as values.\n\n",
    definition_note,
    "Here is an example of the expected output:\n\n",
    example_json, "\n\n",
    "Here is the text from which you should extract the entities:",
    "\n\n####\n\n",
    text, "\n\n####\n\nProvide your JSON output below."
  )

  if (prompt_only) {
    return(task)
  }

  # `interrogate_llm()` and `get_prompts()` are defined elsewhere in the
  # package. The answer is parsed as JSON and flattened, which discards the
  # entity type keys.
  interrogate_llm(
    c("system" = get_prompts("persona"), "user" = task),
    force_json = TRUE, ...
  ) |>
    jsonlite::fromJSON() |>
    unlist() |>
    unname()
}
0 commit comments