-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathextract_text.c
executable file
·86 lines (66 loc) · 2.37 KB
/
extract_text.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#include "extract_text.h"
//Pulls out the piece of 'Line' lying between the markers 'ItemStart' and
//'ItemEnd', tidies it according to 'ExtractFlags', and passes any non-empty
//result to VarsAddDownloadItem under the name 'ItemName'.
//
//  Line          text to search
//  ItemName      name handed through to VarsAddDownloadItem
//  ItemStart     marker in front of the wanted text; when it has zero length
//                extraction starts from the beginning of 'Line'
//  ItemEnd       marker after the wanted text; when StrValid() rejects it the
//                whole remainder of the line is taken
//  Vars          list handed through to VarsAddDownloadItem
//  ExtractFlags  bitmask of EXTRACT_ flags
//
//Returns the position pointer produced by the last GetToken call, or 'Line'
//unchanged if no GetToken call was needed.
const char *GenericExtractFromLine(const char *Line, const char *ItemName, const char *ItemStart, const char *ItemEnd, ListNode *Vars, int ExtractFlags)
{
    const char *Rest=Line;
    char *Grabbed=NULL, *Value=NULL, *Dup, *GrabbedEnd;
    int StartLen, ValueLen;
    int TokenFlags=0;

    if (ExtractFlags & EXTRACT_WITHIN_QUOTES) TokenFlags=GETTOKEN_QUOTES;
    StartLen=StrLen(ItemStart);

    //move past the start marker, when there is one
    if (StartLen != 0) Rest=GetToken(Line, ItemStart, &Grabbed, 0);

    //grab up to the end marker, or take everything that is left
    if (StrValid(ItemEnd)) Rest=GetToken(Rest, ItemEnd, &Grabbed, TokenFlags);
    else Grabbed=CopyStr(Grabbed, Rest);

    //the start marker may appear again inside what was grabbed; each time it
    //does, drop everything up to and including that occurrence
    if (StartLen != 0)
    {
        for (Dup=strstr(Grabbed, ItemStart); Dup != NULL; Dup=strstr(Grabbed, ItemStart))
        {
            Dup+=StartLen;
            GrabbedEnd=Grabbed + StrLen(Grabbed);
            //the '+ 1' carries the terminating nul along with the text
            memmove(Grabbed, Dup, (GrabbedEnd - Dup) + 1);
        }
    }

    //caller wants the start marker kept on the front of the result
    if (ExtractFlags & EXTRACT_INCLUDE_START)
    {
        Value=MCopyStr(Value, ItemStart, Grabbed, NULL);
        Grabbed=CopyStr(Grabbed, Value);
    }

    //at most one unquoting style is applied, chosen in this priority order
    if (ExtractFlags & EXTRACT_DEQUOTE)
    {
        Value=HTTPUnQuote(Value, Grabbed);
    }
    else if (ExtractFlags & EXTRACT_DEHTMLQUOTE)
    {
        Value=HTMLUnQuote(Value, Grabbed);
    }
    else if (ExtractFlags & EXTRACT_DESLASHQUOTE)
    {
        Value=UnQuoteStr(Value, Grabbed);
    }
    else
    {
        Value=CopyStr(Value, Grabbed);
    }

    StripLeadingWhitespace(Value);
    StripTrailingWhitespace(Value);

    ValueLen=StrLen(Value);
    if (ValueLen > 0)
    {
        //a trailing '/' is cut off when the text came from inside an html tag
        if ((ExtractFlags & EXTRACT_WITHIN_HTMLTAG) && (Value[ValueLen-1]=='/'))
        {
            Value[ValueLen-1]='\0';
        }

        StripQuotes(Value);

        if (ExtractFlags & EXTRACT_NOSPACES) strrep(Value, ' ', '+');

        //rewrite an 'https:' prefix to 'http:' when only http is wanted
        if ((ExtractFlags & EXTRACT_HTTP_ONLY) && (strncmp(Value, "https:", 6)==0))
        {
            Grabbed=MCopyStr(Grabbed, "http:", Value+6, NULL);
            Value=CopyStr(Value, Grabbed);
        }

        VarsAddDownloadItem(ItemName, Value, Vars, ExtractFlags);
    }

    Destroy(Grabbed);
    Destroy(Value);

    return(Rest);
}
//Looks for a page title in 'Line'. Each recognised form that is present is
//extracted with GenericExtractFromLine using EXTRACT_DEQUOTE: an html <title>
//element goes under the item name "Title:html", while both the 'title' and
//'og:title' meta tags go under "Title:meta". Forms are tried in table order.
void GenericTitleExtract(const char *Line, ListNode *Vars)
{
    static const struct
    {
        const char *Name;   //item name passed to GenericExtractFromLine
        const char *Start;  //marker that must be present, and where extraction starts
        const char *End;    //marker that ends the extracted text
    } TitleForms[] =
    {
        {"Title:html", "<title>", "</title>"},
        {"Title:meta", "<meta name=\"title\" content=\"", "\""},
        {"Title:meta", "<meta property=\"og:title\" content=\"", "\""}
    };
    unsigned int i;

    for (i=0; i < sizeof(TitleForms) / sizeof(TitleForms[0]); i++)
    {
        if (strstr(Line, TitleForms[i].Start) == NULL) continue;

        GenericExtractFromLine(Line, TitleForms[i].Name, TitleForms[i].Start, TitleForms[i].End, Vars, EXTRACT_DEQUOTE);
    }
}