Appendix F

URL Splitting Algorithm Pseudo-Code




Below is the pseudo-code defining the algorithm used to split the URLs found in links into a domain, path and filename. It rebuilds relative URLs found in sub-pages against the location of the main page, and recombines the individual parts into a complete URL for download.

Function is passed: 	url, is_subpage
Global variables used:	stored_domain, stored_path
Function returns:		new_url

// Splits url into domain, path and filename, resolves it against the main
// page's stored location when it comes from a sub-page, and rebuilds the
// complete URL in new_url.
//   url         - the link text to split (absolute "http://..." or relative)
//   is_subpage  - TRUE when url was found inside the main page
//   stored_*    - domain and path of the main page; written only when the
//                 main page itself (is_subpage = FALSE) is processed
// Every ERROR is followed by EXIT, so new_url is only built for valid input.
domain := NULL
path := NULL
// Empty string rather than NULL so the final concatenation is always valid.
filename := ""

IF url STARTS WITH "http://" THEN
	// Absolute URL: drop the scheme, then split at the first and last "/".
	url := RIGHT OF url AFTER AND NOT INCLUDING "http://"
	IF "/" NOT IN url THEN
		// Domain only (e.g. "http://example.com"): request the root.
		domain := url
		path := "/"
		filename := ""
	ELSE
		domain := LEFT OF url UP TO AND NOT INCLUDING FIRST "/"
		url := RIGHT OF url AFTER AND INCLUDING FIRST "/"
		filename := RIGHT OF url AFTER AND NOT INCLUDING LAST "/"
		url := LEFT OF url UP TO AND INCLUDING LAST "/"
		path := url
	END IF
ELSE IF url STARTS WITH "mailto:" THEN
	ERROR "This is a link to an email address"
	EXIT
ELSE
	// Any remaining ":" indicates a scheme other than HTTP.
	IF ":" IN url THEN
		ERROR "Can only download pages using HTTP"
		EXIT
	END IF
	IF "/" NOT IN url THEN
		// Bare filename: it has no path of its own. Left empty so that
		// stored_path (which always ends in "/") is not followed by a
		// second "/" when the two are joined below.
		path := ""
		filename := url
	ELSE
		filename := RIGHT OF url AFTER AND NOT INCLUDING LAST "/"
		url := LEFT OF url UP TO AND INCLUDING LAST "/"
		path := url
	END IF
END IF

IF is_subpage = TRUE AND domain = NULL THEN
	// Relative link inside the main page: inherit the main page's domain.
	domain := stored_domain
	// A path beginning with "/" is relative to the domain root and is used
	// as-is; anything else is relative to the main page's directory.
	IF NOT path STARTS WITH "/" THEN
		path := stored_path + path
	END IF
ELSE IF is_subpage = TRUE AND NOT domain = stored_domain THEN
	ERROR "Sub page was not within domain of the main page"
	EXIT
ELSE IF is_subpage = FALSE AND domain = NULL THEN
	ERROR "The URL specified is invalid"
	EXIT
ELSE IF is_subpage = FALSE THEN
	// Only the main page defines the base location. An absolute sub-page
	// URL on the same domain must not overwrite it, otherwise later
	// relative links would be resolved against the wrong directory.
	stored_domain := domain
	stored_path := path
END IF

new_url := "http://" + domain + path + filename