-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregex.c
1923 lines (1578 loc) · 53 KB
/
regex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* Author: Jack Robbins
* This file contains the implementation for the regex library API defined in
* regex.h . Specifically, this file will generate a state machine that recognizes
* strings belonging to a regular expression
*/
#include "regex.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
//Forward declare
typedef struct NFA_state_t NFA_state_t;
typedef struct NFA_fragement_t NFA_fragement_t;
typedef struct fringe_states_t fringe_states_t;
typedef struct state_list_t state_list_t ;
typedef struct NFA_state_list_t NFA_state_list_t;
typedef struct DFA_state_t DFA_state_t;
/**
* A struct that defines an NFA state
* If opt < 128, we have a labeled arrow stored in out
* If opt = SPLIT, we have two arrows
* If opt = ACCEPTING, we have an accepting state
*/
struct NFA_state_t {
//Was this state visited?
//0 - default
//1 - visited by dfa constructor
//3 - from split
u_int8_t visited;
//The char that we hold
u_int16_t opt;
//The default next
NFA_state_t* next;
//The optional second next for alternating states
NFA_state_t* next_opt;
//The next created state in the linked list
NFA_state_t* next_created;
};
/**
* Define a linked list structure that holds all of the states on the
* fringe of a fragment. States on the fringe of a fragment are the ones
* that will need to be patched into the next fragment, so we should keep track
* of these
*/
struct fringe_states_t {
//The next arrow list struct
fringe_states_t* next;
//The state that we point to
NFA_state_t* state;
};
/**
* A struct that defines an NFA fragement with a list of arrows
* and a start state for that specific fragment. Remember, a big
* NFA can be thought of as a bunch of small ones concatenated, and
* that will be our approach here
*/
struct NFA_fragement_t {
//The start state of the fragment
NFA_state_t* start;
//Keep track of all of the states on the very edge of our fragment
fringe_states_t* fringe_states;
};
/**
* When converting from an NFA to a DFA, we represent each DFA state as a list of reachable
* NFA states. This struct will be used for this purpose
*/
struct NFA_state_list_t {
NFA_state_t* states[145];
//Length of our list
u_int16_t length;
//Does this list contain an accepting state?
u_int8_t contains_accepting_state;
//Does this list have a wildcard?
u_int8_t contains_wild_card;
//Does this list have a NUMBERS state?
u_int8_t contains_numbers;
//Does this list containt LOWERCASE?
u_int8_t contains_lowercase;
//Does this state contain uppercase?
u_int8_t contains_uppercase;
//Does this have all the letters
u_int8_t contains_letters;
};
/**
* A state that we will use for our DFA caching scheme. It is well known that DFA's are more efficent
* than NFAs, so this will help us speed things up
*/
struct DFA_state_t {
//The list of the NFA states that make up the DFA state
NFA_state_list_t nfa_state_list;
//Hold the address of the NFA state
NFA_state_t* nfa_state;
//This list is a list of all the states that come from this DFA state. We will use the char itself to index this state. Remember that printable
//chars range from 0-127
DFA_state_t* transitions[135];
//The next dfa_state that was made, this will help us in freeing
DFA_state_t* next;
};
/**
* An improved version of the postfix converter using an operator stack
*/
char* in_to_post(char* regex, regex_mode_t mode){
//Sanity check
if(regex == NULL || strlen(regex) == 0){
if(mode == REGEX_VERBOSE){
printf("ERROR: Null regex passed in\n");
}
//Get out
return NULL;
}
//Check that all characters passed in are printable characters. If they aren't
//simply exit
for(char* regex_cursor = regex; *regex_cursor != '\0'; regex_cursor++){
if(*regex_cursor < 32 || *regex_cursor > 126){
if(mode == REGEX_VERBOSE){
printf("ERROR: Non-printable character passed in\n");
}
return NULL;
}
}
//Now that we know that we are in the clear here, we can begin allocating some stuff
//Allocate plenty of space for ourselves here
char* regex_with_concatenation = calloc(strlen(regex) * 5, sizeof(char));
/**
* We will now go through and add in the explicit concatenation characters(`)
* The rules for adding these are as follows:
* 1.) Is the next character a regular char or not?
*/
//Preserve the original pointer
char* cursor = regex;
char* concat_cursor = regex_with_concatenation;
//Keep track of the last thing that we saw
char previous_char = '\0';
//Go through and add everything into the regex_with_concatenation string
while(*cursor != '\0'){
switch(*cursor){
//For all of these, we'll just attach it and move on
case '*':
case '+':
case '?':
case '|':
case ')':
//Now add the char in
previous_char = *cursor;
*concat_cursor = *cursor;
cursor++;
concat_cursor++;
break;
//Handle an open parenthesis
case '(':
//This is the one case that we will not concatenate
if(previous_char == '\0' || previous_char == '|' || previous_char == '('){
*concat_cursor = *cursor;
cursor++;
concat_cursor++;
break;
//If we see repetition, we'll add a dummy char in
}
//We'll have to concatenate here
*concat_cursor = '`';
concat_cursor++;
//Add in the parenthesis
previous_char = *cursor;
*concat_cursor = *cursor;
cursor++;
concat_cursor++;
break;
//This is our escape character
case '\\':
//We can add in concatenation here
if(previous_char != '\0' && previous_char != '(' && previous_char != '|'){
*concat_cursor = '`';
concat_cursor++;
}
*concat_cursor = '\\';
//We can use this as the previous char
previous_char = '\\';
concat_cursor++;
cursor++;
//Add whatever we had in
*concat_cursor = *cursor;
concat_cursor++;
cursor++;
break;
//We can see [0-9] or [a-z] or [A-Z] or [a-zA-z]
case '[':
//If the previous char was an open paren, we won't
//add a concatenation
if(previous_char !='\0' && previous_char != '(' && previous_char != '|'){
*concat_cursor = '`';
concat_cursor++;
}
cursor++;
if(*cursor == '0'){
//We need to see this sequence, otherwise it's bad
if(*(cursor + 1) != '-' || *(cursor + 2) != '9' || *(cursor+3) != ']'){
if(mode == REGEX_VERBOSE){
printf("ERROR: Invalid range provided\n");
}
//This is bad so we'll get out
return NULL;
}
} else if (*cursor == 'a'){
//We need to see this sequence, otherwise it's bad
if(*(cursor + 1) != '-' || *(cursor + 2) != 'z'){
if(mode == REGEX_VERBOSE){
printf("ERROR: Invalid range provided\n");
}
//This is bad so we'll get out
return NULL;
}
//Let's see which we actually have here
if(*(cursor+3) == ']'){
//Jump to our basic range
goto basic_range;
} else {
//Otherwise we should have A-Z left
if(*(cursor + 3) != 'A' || *(cursor + 4) != '-' || *(cursor + 5) != 'Z'){
if(mode == REGEX_VERBOSE){
printf("ERROR: Invalid range provided\n");
}
//This is bad so we'll get out
return NULL;
}
//Add this on
strcat(regex_with_concatenation, "[a-zA-Z]");
previous_char = ']';
//Jump ahead here
concat_cursor += 8;
//Jump ahead
cursor += 7;
break;
}
} else if (*cursor == 'A'){
//We need to see this sequence, otherwise it's bad
if(*(cursor + 1) != '-' || *(cursor + 2) != 'Z' || *(cursor+3) != ']'){
if(mode == REGEX_VERBOSE){
printf("ERROR: Invalid range provided\n");
}
//This is bad so we'll get out
return NULL;
}
} else {
if(mode == REGEX_VERBOSE){
printf("ERROR: Invalid range provided\n");
}
//This is bad so we'll get out
return NULL;
}
basic_range:
//Go through and add them in
*concat_cursor = '[';
concat_cursor++;
//These can vary based on what is actually in here
*concat_cursor = *cursor;
concat_cursor++;
*concat_cursor = *(cursor + 1);
concat_cursor++;
*concat_cursor = *(cursor + 2);
concat_cursor++;
*concat_cursor = ']';
concat_cursor++;
//Record the previous char here
previous_char = ']';
//Move ahead
cursor += 4;
break;
//Now we can handle all of our letters
default:
//If the previous char was an open paren, we won't
//add a concatenation
if(previous_char != '\0' && previous_char != '(' && previous_char != '|'){
*concat_cursor = '`';
concat_cursor++;
}
//Save the previous char
previous_char = *cursor;
//Add back in
*concat_cursor = *cursor;
concat_cursor++;
cursor++;
break;
}
}
//Display if we're in verbose mode
if(mode == REGEX_VERBOSE){
printf("With concatenation characters added: %s\n", regex_with_concatenation);
}
/**
* Now that we've added in explicit concatenation, we can go through and use
* the Shunting-Yard algorithm to create the postfix expression
*/
//This will eventually be used for our postfix display
char* postfix = calloc(strlen(regex)*5, sizeof(char));
//Restart the concat cursor
concat_cursor = regex_with_concatenation;
//This ensures we don't lose the start
char* postfix_cursor = postfix;
char stack_cursor;
u_int8_t found_open;
//An operator stack that will hold any operators that we see
stack_t* operator_stack = create_stack();
/**
* Shunting Yard Algorithm:
* Go through the infix expression char by char
* If we see an operator, compare that operator with the operator stack
* If the operator on the top of the stack has a higher precedence, pop it off
* and append to the string. If not, push the new operator onto the stack.
*
* At the very end, we pop all operators off of the stack and place them at the end of the
* string
*/
//Go through the regex with concatenation string
while(*concat_cursor != '\0'){
//Switch based on what character we see
switch(*concat_cursor){
//If we see the escape character, it and whatever
//come after are treated just like regular characters
case '\\':
//Add the explicit escape char in
*postfix_cursor = *concat_cursor;
postfix_cursor++;
concat_cursor++;
//Add whatever was escaped in too
*postfix_cursor = * concat_cursor;
postfix_cursor++;
concat_cursor++;
break;
//Handle kleene start operator
case '*':
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)peek(operator_stack));
//If it's of equal or higher precedence(?, * or +) we'll pop it off
if(stack_cursor == '*' || stack_cursor == '+' || stack_cursor == '?'){
//Pop it and add it on to the postfix
*postfix_cursor = *((char*)pop(operator_stack));
postfix_cursor++;
//Otherwise it's of lower precedence so we'll leave
} else {
break;
}
}
//This will always get pushed on
push(operator_stack, "*");
//Advance the cursor
concat_cursor++;
break;
//Handle positive closure operator
case '+':
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)peek(operator_stack));
//If it's of equal or higher precedence(?, * or +) we'll pop it off
if(stack_cursor == '*' || stack_cursor == '+' || stack_cursor == '?'){
//Pop it and add it on to the postfix
*postfix_cursor = *((char*)pop(operator_stack));
postfix_cursor++;
//Otherwise it's of lower precedence so we'll leave
} else {
break;
}
}
//This will always get pushed on
push(operator_stack, "+");
//Advance the cursor
concat_cursor++;
break;
//Handle 0 or 1 operator
case '?':
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)peek(operator_stack));
//If it's of equal or higher precedence(?, * or +) we'll pop it off
if(stack_cursor == '*' || stack_cursor == '+' || stack_cursor == '?'){
//Pop it and add it on to the postfix
*postfix_cursor = *((char*)pop(operator_stack));
postfix_cursor++;
//Otherwise it's of lower precedence so we'll leave
} else {
break;
}
}
//This will always get pushed on
push(operator_stack, "?");
//Advance the cursor
concat_cursor++;
break;
case '`':
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)peek(operator_stack));
//If it's of equal or higher precedence(?, * or +) we'll pop it off
if(stack_cursor == '*' || stack_cursor == '+' || stack_cursor == '?'
|| stack_cursor == '`'){
//Pop it and add it on to the postfix
*postfix_cursor = *((char*)pop(operator_stack));
postfix_cursor++;
//Otherwise it's of lower precedence so we'll leave
} else {
break;
}
}
//This will always get pushed on
push(operator_stack, "`");
//Advance the cursor
concat_cursor++;
break;
//Handle the union(|) symbol
case '|':
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)peek(operator_stack));
//If it's of equal or higher precedence(?, * or + or '`' or `|`) we'll pop it off
if(stack_cursor == '*' || stack_cursor == '+' || stack_cursor == '?'
|| stack_cursor == '`' || stack_cursor == '|'){
//Pop it and add it on to the postfix
*postfix_cursor = *((char*)pop(operator_stack));
postfix_cursor++;
//Otherwise it's of lower precedence so we'll leave
} else {
break;
}
}
//This will always get pushed on
push(operator_stack, "|");
//Advance the cursor
concat_cursor++;
break;
//Handle left parenthesis
case '(':
//If we see this, we simply push it onto the stack and keep moving
push(operator_stack, "(");
//Move on to next character
concat_cursor++;
break;
//Handle the closing parenthesis
case ')':
//Once we see these, we'll need to keep popping off of the stack
//until we either run out or we hit a closing parenthesis
found_open = 0;
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)pop(operator_stack));
//If it's an end paren then we're done
if(stack_cursor == '('){
found_open = 1;
break;
} else {
//Otherwise, append the operator that we have to the postfix string
*postfix_cursor = stack_cursor;
postfix_cursor++;
}
}
//If this happens, we had an unmatched closing parenthesis. We'll cleanup and get out
if(found_open == 0){
printf("ERROR: Unmatched closing parenthesis");
//Free this because we know it's bad
free(postfix);
free(regex_with_concatenation);
//Cleanup
destroy_stack(operator_stack, STATES_ONLY);
return NULL;
}
//Advance the cursor
concat_cursor++;
break;
//If we're here, we have a regular character so we just append it
default:
*postfix_cursor = *concat_cursor;
concat_cursor++;
postfix_cursor++;
break;
}
}
//Once we reach the end, we'll need to pop everything else that we have on the stack off and append it
//As long as we haven't hit the bottom
while(peek(operator_stack) != NULL){
//Pop off of the stack
stack_cursor = *((char*)pop(operator_stack));
//If we get this, it means that we have an unmatched parenthesis
if(stack_cursor == '('){
printf("ERROR: Unmatched opening parenthesis\n");
//Free this because we know it's bad
free(postfix);
free(regex_with_concatenation);
//Cleanup
destroy_stack(operator_stack, STATES_ONLY);
return NULL;
}
//Otherwise, append the operator that we have to the postfix string
*postfix_cursor = stack_cursor;
postfix_cursor++;
}
//Destroy the whole stack
destroy_stack(operator_stack, STATES_ONLY);
//We don't need this anymore
free(regex_with_concatenation);
//Display if the user wants
if(mode == REGEX_VERBOSE){
printf("Postfix regular expression: %s\n", postfix);
}
return postfix;
}
/* ================================================== NFA Methods ================================================ */
/**
* Create and return a state
*/
static NFA_state_t* create_state(u_int32_t opt, NFA_state_t* next, NFA_state_t* next_opt){
//Allocate a state
NFA_state_t* state = (NFA_state_t*)calloc(1, sizeof(NFA_state_t));
//Assign these values
state->visited = 0;
state->opt = opt;
state->next = next;
state->next_opt = next_opt;
//Must be set later on
state->next_created = NULL;
//Give the pointer back
return state;
}
/**
* Create and return a fragment. A fragment is a partially built NFA. Our system works by building
* consecutive fragments on top of previous fragments
*/
static NFA_fragement_t* create_fragment(NFA_state_t* start, fringe_states_t* fringe_states){
//Allocate our fragment
NFA_fragement_t* fragment = (NFA_fragement_t*)calloc(1, sizeof(NFA_fragement_t));
fragment->start = start;
fragment->fringe_states = fringe_states;
//Return a reference to the fragment
return fragment;
}
/**
* Create a list of fringe states containing just the singular state. This will be used
* when we first create a new fragment for a single character, since when this happens, the
* only thing in the "fringe" is that fragment itself
*/
static fringe_states_t* init_list(NFA_state_t* state){
//Create a new fringe_states_t
fringe_states_t* list = calloc(1, sizeof(fringe_states_t));
//Assign the state pointer
list->state = state;
//The next pointer is NULL, this hasn't been attached to any other states yet
list->next = NULL;
//Return the pointer
return list;
}
/**
* Make the list of states contained in the arrow_list out to point to the start state
* of the next fragement "start"
*
* point_opt will make use of next if 1, next_opt if 0
* NOTE: we really should never use next_opt UNLESS we are a split state, in which case we never even
* get here
*/
static void concatenate_states(fringe_states_t* fringe, NFA_state_t* start, u_int8_t point_opt){
//A cursor so we don't affect the original pointer
fringe_states_t* cursor = fringe;
//Go over the entire outlist
while(cursor != NULL){
//If this has a valid state
if(cursor->state != NULL){
//This fringe states next state should be the new start state
if(point_opt == 1){
cursor->state->next = start;
} else {
cursor->state->next_opt = start;
}
}
//Advance the cursor
cursor = cursor->next;
}
}
/**
* Connect the two linked lists of list_1 and list_2 to be one big linked list
* with list_1 as the head
*/
static fringe_states_t* concatenate_lists(fringe_states_t* list_1, fringe_states_t* list_2){
//A cursor for traversal
fringe_states_t* cursor = list_1;
//Find the tail of list 1
while(cursor->next != NULL){
cursor = cursor->next;
}
//Concatenate the lists
cursor->next = list_2;
return list_1;
}
/**
* Destroy the list of fringe states once we are done using it to avoid any
* memory leakage
*/
static void destroy_fringe_list(fringe_states_t* list){
//Temp for freeing
fringe_states_t* temp;
//Walk the list
while(list != NULL){
//Save list
temp = list;
//Advance the pointer
list = list->next;
//Free the current transition_list_t
free(temp);
}
}
/**
* Ability to print out an NFA for debug purposes
*/
static void print_NFA(NFA_state_t* nfa){
if(nfa == NULL || nfa->visited == 2){
return;
}
if(nfa->opt != ACCEPTING){
nfa->visited = 2;
}
//Support printing of special characters split and accepting
if(nfa->opt == SPLIT_ALTERNATE){
printf("State -SPLIT_ALTERNATE->");
} else if(nfa->opt == SPLIT_ZERO_OR_ONE){
printf("State -SPLIT_ZERO_OR_ONE->");
} else if(nfa->opt == SPLIT_POSITIVE_CLOSURE){
printf("State -SPLIT_POSITIVE_CLOSURE->");
} else if(nfa->opt == SPLIT_KLEENE){
printf("State -SPLIT_KLEENE->");
} else if(nfa->opt == ACCEPTING){
printf("State -ACCEPTING->");
} else {
printf("State -%c->", (u_int8_t)nfa->opt);
}
if(nfa->opt == SPLIT_ALTERNATE || nfa->opt == SPLIT_ZERO_OR_ONE){
print_NFA(nfa->next);
printf("\n");
print_NFA(nfa->next_opt);
} else if(nfa->opt == SPLIT_KLEENE || nfa->opt == SPLIT_POSITIVE_CLOSURE){
print_NFA(nfa->next);
printf("\n");
print_NFA(nfa->next_opt);
nfa->visited = 2;
} else {
print_NFA(nfa->next);
}
}
/**
* Create an NFA from a postfix regular expression FIXME does not work for () combined with *, | or +
*/
static void create_NFA(regex_t* regex, char* postfix, regex_mode_t mode){
//Create a stack for pushing/popping
stack_t* stack = create_stack();
//The head of the linked list
NFA_state_t* head = NULL;
//Declare these for use
NFA_fragement_t* frag_2;
NFA_fragement_t* frag_1;
NFA_fragement_t* fragment;
NFA_state_t* split;
NFA_state_t* s;
//Declare this for our use as well
fringe_states_t* fringe;
//Iterate until we hit the null terminator
for(char* cursor = postfix; *cursor != '\0'; cursor++){
//Grab the current char
char ch = *cursor;
//Switch on the character
switch(ch){
//Concatenation character
case '`':
//Pop the 2 most recent literals off of the stack
frag_2 = (NFA_fragement_t*)pop(stack);
frag_1 = (NFA_fragement_t*)pop(stack);
//We need to set all of the fringe states in fragment 1 to point to the start
//of fragment 2
//Concatenation ALWAYS follows the "next" option without exception
concatenate_states(frag_1->fringe_states, frag_2->start, 1);
//The fringe list of fragment 1 should be irrelevant now, so we can get rid of it
destroy_fringe_list(frag_1->fringe_states);
//Push a new fragment up where the start of frag_1 points is the start, and all of the fringe states
//are fragment 2's fringe states
push(stack, create_fragment(frag_1->start, frag_2->fringe_states));
//However, we do still need the states in frag_2->fringe_states, so we'll leave those be
//We're done with these now, so we should free them
free(frag_1);
free(frag_2);
break;
//Alternate state
case '|':
//Grab the two most recent fragments off of the stack
frag_2 = (NFA_fragement_t*)pop(stack);
frag_1 = (NFA_fragement_t*)pop(stack);
//Create a new special "split" state that acts as a fork in the road between the two
//fragment start states
split = create_state(SPLIT_ALTERNATE, frag_1->start, frag_2->start);
//Linked list attachment
if(head == NULL){
head = split;
} else {
split->next_created = head;
head = split;
}
//Combine the two fringe lists to get the new list of all fringe states for this fragment
fringe_states_t* combined = concatenate_lists(frag_1->fringe_states, frag_2->fringe_states);
//Push the newly made state and its transition list onto the stack
push(stack, create_fragment(split, combined));
//Notice how we won't free any lists here, because they both still hold data about the fringe
//Free these pointers as they are no longer needed
free(frag_1);
free(frag_2);
break;
//0 or more, more specifically the kleene star
case '*':
//Pop the most recent fragment
frag_1 = pop(stack);
//Create a new state. This new state will act as our split. This state will point to the start of the fragment we just got
split = create_state(SPLIT_KLEENE, NULL, frag_1->start);
//Linked list attachment
if(head == NULL){
head = split;
} else {
split->next_created = head;
head = split;
}
//Make all of the states in fragment_1 point to the beginning of the split
//using their next_opt to allow for our "0 or more" functionality
concatenate_states(frag_1->fringe_states, split, 1);
//After concatenation, these are useless to us
destroy_fringe_list(frag_1->fringe_states);
//Create a new fragment that originates at the new state, allowing for our "0 or many" function here
push(stack, create_fragment(split, init_list(split)));
//Free this pointer as it is no longer needed
free(frag_1);
break;
//1 or more, more specifically positive closure
case '+':
//Grab the most recent fragment
frag_1 = pop(stack);
//We'll create a new state that acts as a split, going back to the the original state
//This acts as our optional 1 or more
split = create_state(SPLIT_POSITIVE_CLOSURE, NULL, frag_1->start);
//Linked list attachment
if(head == NULL){
head = split;
} else {
split->next_created = head;
head = split;
}
//Set all of the fringe states in frag_1 to point at the split
concatenate_states(frag_1->fringe_states, split, 1);
//After concatenation, these are useless to us
destroy_fringe_list(frag_1->fringe_states);
//Create a new fragment that represent this whole structure and push to the stack
//Since this one is "1 or more", we will have the start of our next fragment be the start of the old fragment
push(stack, create_fragment(frag_1->start, init_list(split)));
//Free this pointer
free(frag_1);
break;
//0 or 1 instances of the preceding fragment
case '?':
//Grab the most recent fragment
frag_1 = pop(stack);
//We'll create a new state that acts as a split, but this time we won't add any arrows back to this
//state. This allows for a "zero or one" function
//NOTE: Here, we'll use Split's next-opt to point back to the fragment at the start
split = create_state(SPLIT_ZERO_OR_ONE, NULL, frag_1->start);
//Linked list attachment
if(head == NULL){
head = split;
} else {
split->next_created = head;
head = split;
}
//Note how for this one, we won't concatenate states at all, but we'll instead concatentate
//the two fringe lists into one big one because the fringe is a combined fringe
fringe = concatenate_lists(frag_1->fringe_states, init_list(split));
//Create a new fragment that starts at the split, and represents this whole structure. We also need to chain the lists together to keep everything connected
push(stack, create_fragment(split, fringe));
//We won't free the fragments list here because it still holds fringe data
//Free this pointer
free(frag_1);
break;
//If we see the escape character, then we process the immediately next character as a regular char
case '\\':
cursor++;
//Create a new state with the escaped character
s = create_state(*cursor, NULL, NULL);
//Linked list attachment
if(head == NULL){
head = s;
} else {
s->next_created = head;
head = s;
}
//Create a fragment with the fringe states being the new state that we created
fragment = create_fragment(s, init_list(s));
//Push this new fragment to the stack
push(stack, fragment);
break;
//Wildcard
case '$':
s = create_state(WILDCARD, NULL, NULL);
//Linked list attachment
if(head == NULL){
head = s;
} else {
s->next_created = head;
head = s;
}
//Create a fragment, with the fringe states of that fragment being just this new state that we
//created
fragment = create_fragment(s, init_list(s));
//Push the fragment onto the stack. We will pop it off when we reach operators
push(stack, fragment);
break;
//Range numbers
case '[':
//We've already done checking by now to make sure that this is actually valid
if(*(cursor+1) == '0'){
s = create_state(NUMBER, NULL, NULL);
cursor += 4;
} else if (*(cursor + 1) == 'a'){
if(strlen(cursor) > 3 && *(cursor + 4) == 'A'){
s = create_state(LETTERS, NULL, NULL);
cursor += 7;
} else {
s = create_state(LOWERCASE, NULL, NULL);
cursor += 4;
}
} else if (*(cursor + 1) == 'A'){
s = create_state(UPPERCASE, NULL, NULL);
cursor += 4;
}
//Linked list attachment
if(head == NULL){