``
`` Scriptol Spy Links 1.1
`` (c) 2009 Denis Sureau - Scriptol.com
``

`` Free under the GNU GPL 3 License.
`` Requires the PHP interpreter.
`` Sources are compiled with the Scriptol PHP compiler 7.0
``
`` The program checks the social aspect of a website through external links.
`` Read the manual for details of use at: 
`` http://www.scriptol.com/scripts/spy-links.php.
``

`` This program is free software: you can redistribute it and/or modify
`` it under the terms of the GNU General Public License as published by
`` the Free Software Foundation, either version 3 of the License, or
`` (at your option) any later version.

`` This program is distributed in the hope that it will be useful,
`` but WITHOUT ANY WARRANTY; without even the implied warranty of
`` MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
`` GNU General Public License for more details.

`` You should have received a copy of the GNU General Public License
`` along with this program.  If not, see <http://www.gnu.org/licenses/>.
``


include "path.sol"
include "dom.sol"
include "url.sol"

// Options. Most of them are changed by processCommand from the command line.

boolean RECURSE = false     // True to scan the whole site (not referenced in this part of the file)
boolean PROCESSDEFAULT = true  // For internal links ending with "/", also mark the default page as checked
boolean SPECIFICFLAG = false   // True (-s option) to search for links to the domain held in 'specific'
boolean BROKEN = false            // True (-c option) to really test each link; when false every link is assumed valid
int LIMIT = -1                 // Maximal number of pages to scan, a negative value means no limit
int TOPLIMIT = 10              // Number of most linked domains displayed in the final report

// Counters updated during the scan.

int linkcount       // Number of links processed by checkLink
int brocount        // Number of links whose status is not 200, plus pages that failed to load
int elinks          // Number of links toward other websites
int ilinks          // Number of internal links

text hostsite       // Domain of the checked site
text specific       // Domain searched specifically (-s option)
text currentPage    // Page where the links were found

// Tables filled during the scan.

array toplinks      // List of external linked sites (not referenced in this part of the file)
array params = {}   // Results of the command line parsing, the start URL is under "server"
dict checked = {}  // Status of each link already checked, indexed by URL
array scanned = {}  // Pages already scanned; used with the page URL as key
dict domains = {}   // Number of external links found for each linked domain
array retrieved = {} // Pages holding a link to the specific domain

// How to use this program.
// Displays the help text, then terminates the script with exit status 0:
// the function never gives control back to its caller.
// It is called by processCommand whenever the command line is missing
// or invalid.
// Every option recognized by processCommand is listed, including -u and -y.

void usage()
    print
    print "Spy Links - (c) 2009 Scriptol.com - Freeware"
    print "--------------------------------------------"
    print "Syntax:"
    print "  solp spylin [options] url"
    print "Options:"
    print "  -{n}       maximal number of pages, default all."
    print "  -t{n}      change number of top site to display."
    print "  -s{domain} search for links to a specific website."
    print "  -c         check links, slower."
    print "  -y         force retry when checking links, with -c."
    print "  -v         verbose, display more infos."
    print "  -u         debug, display internal traces."
    print "  -q         quiet, display nothing."
    print "Arguments:"
    print "  url: http address of a page, usually the home page."
    print "Logs stored into 'links.log'."
    print "More info at: http://www.scriptol.com/scripts/"
    exit(0)
return


// Split a URL into the website part and the filename part.
//   url: address starting with a protocol.
// Returns two texts: the site (protocol and host, without the slash that
// follows) and the path found after that slash.
// When the URL holds no path, it is returned whole with an empty filename,
// unless it ends like a page (extension listed in 'extensions'): in that
// case the script stops with an error message.

text, text splitSite(text url)
    // The search starts after the protocol, to skip the "//" separator.
    int slash = url.find('/', 8)
    if slash = nil
        text suffix = Path.getExtension(url)
        if suffix in extensions
            die("$url not a valid url")
        /if
        return url, ""
    /if
    text host = url[0 -- slash]
    text path = url[slash + 1 ..]
return host, path

// Extract the domain name of a URL.
//   url: full address, with the protocol.
// Returns the host in lower case, without www or any other sub-domain:
// leading labels are removed until a single dot remains, so that
// "www.example.com" and "a.b.example.com" both give "example.com".
// Limitation: suffixes made of several labels are not recognized,
// "www.example.co.uk" gives "co.uk".

text getDomain(text url)
    text site = parse_url(url, PHP_URL_HOST)
    site = site.lower()

    // removing any sub-domains including www

    if site.length() > 4
        text fromFirstDot = strstr(site, ".")
        text fromLastDot = strrchr(site, ".")
        // The lengths are compared, not the texts: more than one dot remains
        // while the part starting at the first dot is longer than the part
        // starting at the last dot. An alphabetical comparison depends on the
        // letters of the name and keeps the sub-domain of "www.apple.com".
        while strlen(fromFirstDot) > strlen(fromLastDot)
            site = fromFirstDot[1 ..]
            fromFirstDot = strstr(site, ".")
        /while
    /if
return site

// Tell if a full URL (protocol and website included) belongs to the
// scanned site.
//   url: address to test, compared in lower case.
// Returns true when the address starts with the text of the global
// 'website', false otherwise.

boolean isInternal(text url)
    text candidate = url.lower()
    int siteLength = website.length()

    if candidate[0 -- siteLength] = website return true
return false


// Processing a link: count it, get its status and classify it.
//   url: full address of the link.
// Returns the status of the link. The link is really tested (sockAccess)
// only when the BROKEN option is set, otherwise the status is 200.
// A link already present in 'checked' is counted again in linkcount but
// is not processed further: its recorded status is returned.
// Updates the counters brocount, ilinks and elinks, the table 'domains'
// and, when a specific domain is searched, the list 'retrieved'.

int checkLink(text url)
    text linkDomain = getDomain(url)

    linkcount + 1
    if DEBUG print "Checking $url"
    if @array_key_exists(url, checked) return checked[url]

    // The link is assumed valid unless a real test is asked.
    int status = 200
    if BROKEN
        status = sockAccess(url, FORCERETRY)
    /if
    checked[url] = status

    // Progress indicator of the default display mode.
    if not QUIET and not VERBOSE echo "."

    // Anything other than 200 is counted as bad, only 404 is reported.
    if status != 200
        brocount + 1
        if ((status = 404) and not QUIET)
            print "Broken", url
        /if
        return status
    /if

    // internal link

    if linkDomain = hostsite
        ilinks + 1
        return status
    /if

    // external link: one more link for its domain

    elinks + 1

    if domains[linkDomain] != false
        domains[linkDomain] = domains[linkDomain] + 1
    else
        domains[linkDomain] = 1
    /if

    // Remember the current page when it links to the searched domain.
    if (SPECIFICFLAG = true) and (linkDomain = specific)
        if currentPage not in retrieved let  retrieved.push(currentPage)
        if VERBOSE print "\n", specific, "linked in", currentPage
    /if

return status


// Extract the links of a page and return them as an array.
//   fname:  address of the page to load.
//   caller: page where the link to fname was found, used in the error message.
// Returns the value of the href attribute of each <a> tag of the page,
// in document order. The array is empty when the page holds no link or
// cannot be loaded; in the latter case brocount is incremented.
// Also sets the global currentPage to fname.

array pageScan(text fname, text caller)

    DOMNode current = null
    DOMElement elem = null
    boolean xres

    array links = {}

    if VERBOSE print "Scanning", fname

    currentPage = fname

    DOMDocument d = DOMDocument()

    // Embedded PHP: it uses the names d, fname and xres, these variables
    // must not be renamed. The @ hides the warnings of the HTML parser.
  ~~
    $xres = @$d->loadHTMLFile($fname);
  ~~

    if xres = false
        if VERBOSE print "Error \"$fname\" not found in $caller"
        brocount + 1
        return {}
    /if

    DOMNodeList dnl = d.getElementsByTagName("a")
    int count = dnl.length
    if count = 0 return {}

    // The upper limit is excluded: valid indices are 0 to count - 1.
    for int i in 0 -- count
        current = dnl.item(i)
        if current = null continue
        elem = current      // a DOMElement is required to read attributes
        if elem.hasAttribute("href")
            links.push(elem.getAttribute("href"))
        /if
    /for

return links


// Checking the page: scan it, check each of its links, then scan in turn
// the internal pages it links to.
//   page:   address of the page to scan.
//   caller: page (or "command line") where the link to this page was found,
//           only passed to pageScan for its error message.
// Nothing is done when the page limit is reached, when the address is
// empty or starts with a dot, when the page was already scanned, or when
// its extension is not listed in 'extensions'.
// The page is recorded in 'scanned' and 'checked' before it is loaded.

void httpCheck(text page, text caller)

    array links
    array todo
    text reldir, src, ext

    if LIMIT >= 0
       if scanned.size() >= LIMIT return
    /if

    if trim(page) = nil return
    if page[0] = "." return
    if @array_key_exists(page, scanned) return
    scanned[page] = 200
    checked[page] = 200

    if DEBUG print "Entering $page "

    // Heading made of the page address underlined with dashes.
    // NOTE(review): differed and DIFFEREDFLAG are defined outside this part
    // of the file, presumably the heading is printed later by display().
    differed = "\n$page\n" + "-".dup(page.length())
    DIFFEREDFLAG = true

    // Directory used to resolve relative links, and name of the page.
    if substr(page, -1, 1) = "/"
        reldir = page
        src = ""
    else
        text infos = pathinfo(page)
        reldir = @strtolower(infos['dirname'])
        src = @strtolower(infos['filename'])
        ext = @strtolower(infos['extension'])
        if ext <> nil
            ext = "." + ext
            if ext not in extensions return
            src + ext
        /if
    /if

    if DEBUG print "Processing  $reldir/$src"

    links = pageScan(page, caller)
    if links.size() = 0 return         // get list of links into links

    int count = links.size()
    for int i in 0 -- count
        text link = links[i]
        if link[0] = "#"  continue      // anchor inside the same page
        // Remove the anchor part of the link
        int p = strpos(link, "#", 0)
        if p <> 0
          link = link[0 -- p]
        /if
        if not hasProtocol(link)
            if link.length() > 6
                if link[ .. 2 ] = "../"
                    if VERBOSE print "Should be absolute: $link in $page"
                    continue
                /if
                if link[ .. 6] = "mailto:"
                    if DEBUG print "Skipped mailto."
                    continue
                /if
            /if
            // Relative link, made absolute from the directory of the page
            link = Path.merge(reldir, link)
        /if

        if trim(link) = nil continue
        // Link already met: display its status again without a new check
        if @array_key_exists(link, checked)
           display(checked[link], link, false)
           continue
        /if

        if isInternal(link)
            if PROCESSDEFAULT
                // A directory and its default page are the same link
                if link[ -1 ..] = "/"
                    text home = findDefault(link)
                    if @array_key_exists(home, checked) = false
                       checked[home] = 200
                    /if
                /if
            /if
            todo.push(link)
        /if
        checkLink(link)

   /for

   // scan pages that are internal and checked, but not scanned yet
   for text link in todo
       if @array_key_exists(link, scanned) continue
       if @array_key_exists(link, checked) = false continue
       if checked[link] = 200
            httpCheck(link, page)
       /if
   /for

return


// Entry point of the scan in http mode.
//   page: address given by the user.
// An address ending with a slash is first replaced by the result of
// findDefault, then the checking function is called with "command line"
// as the origin of the link.

void httpProcess(text page)
    text start = page
    if start[ -1 ..] = "/"
        start = findDefault(start)
    /if
    httpCheck(start, "command line")
return


// Parsing command line parameters.
//   argnum:    number of arguments received by main, script name included.
//   arguments: the parameters, without the script name.
// Each option sets the matching global variable. The start URL is stored
// into params["server"]: an array is used to overcome problems with PHP's
// global variables.
// usage() is called, and the script ends, when there is no parameter, when
// a parameter has a single character, or when an option is unknown or
// incomplete. The script also stops if no URL is given.

void processCommand(int argnum, array arguments)

    text opt

    if argnum <  2
        usage()
    /if

    for text param in arguments
        // An option is a dash and a letter, a value may follow directly
        if param.length() > 1
            opt = param[..1]
        else
            usage()
        /if

        if opt
        = "-v"
            VERBOSE = true
            continue
        = "-q"
            QUIET = true
            continue
        = "-u"
            DEBUG = true
            continue
        = "-c"
            BROKEN = true
            continue
        = "-y"
            FORCERETRY = true
            continue
        = "-s"
            SPECIFICFLAG = true
            specific = param[2 ..]
            if specific = "" let usage()
            continue
        = "-t"
            // The number of top sites follows the letter, as in -t20
            int top = int(param[2 ..])
            if top < 1 let usage()
            TOPLIMIT = top
            continue
        /if

        // -{n}: a dash followed by a positive number is the page limit
        if param[0] = "-"
          int x = int(param[1 ..])
          if x > 0
              LIMIT = x
              continue
          /if

        /if

        if param[ .. 4] = "http:"
            server = param
            continue
        /if

        if param[0] = "-"
           print "Unknown command $param"
           usage()
        /if

        // Any other text is the URL, but only one is accepted
        if server = nil
            server = param
            continue
        /if

        print "Unknown command $param"

        usage()

    /for

    if server = nil
       die("You must provide a URL.")
    /if

    params["server"] = server

return

// How a site is (subjectively) evaluated. You can change the formulas here.
//   pages: number of pages scanned.
// Returns a label that depends on the number of external links (global
// elinks) compared to the number of pages. The tests are made in this
// order and the first one that succeeds gives the label.

text evaluate(int pages)
    // Less than one external link for five pages
    if elinks < (pages / 5)
        return "black hole"
    /if
    // Less than one external link for two pages
    if elinks < (pages / 2)
        return "egocentric"
    /if
    // At least five external links per page
    if elinks >= (pages * 5)
        return "very friendly"
    /if
    // At least two external links per page
    if elinks >= (pages * 2)
        return "friendly"
    /if
return "honest"


// Main function: parse the command line, scan the site, write the summary
// into 'links.log' and display the report.
//   argc: number of arguments, script name included.
//   argv: the arguments, the first one is the script name.
// Returns 0. The script is ended by exit(0) after the report of a specific
// domain search (-s option).
// The report holds the evaluation of the site, the counters, and either
// the pages linking to the specific domain or the most linked domains.

int main(int argc, array argv)
    global website

    text filename

    array x = argv[ 1 .. ]

    processCommand(argc, x)

    server = params["server"]
    //print server

    if not hasProtocol(server)
        server = "http://" + server
    /if

    currentPage = server

    website, filename = splitSite(server)
    website = website.lower()

    // NOTE(review): 7 is the length of "http://", this looks wrong for any
    // other protocol - to be confirmed with hasProtocol.
    domain = website[7 ..]
    if substr(domain, -1, 1) = "/"
      domain = domain[ .. -1]
    /if

    baseLength = strlen(domain) + 7    // base is domain plus protocol

    hostsite = getDomain(server)

    if VERBOSE = true print "Verbose mode enabled"
    if DEBUG = true print "Debug mode enabled"
    echo "Checking "
    if LIMIT > -1 echo LIMIT; else echo "all"
    print " pages on", domain
    print "Starting from $server"
    if SPECIFICFLAG = true print "Searching links to", specific

    log = fopen("links.log", "w")
    httpProcess(server)

    int sp = scanned.size()
    text ranking = evaluate(sp)

    // write adds no end of line: one is given to each summary line
    log.write("Site: " + ranking.upper() + "\n")
    log.write("$linkcount links checked in $sp pages.\n")
    log.write("$brocount broken or redirected links, ignored. \n")
    log.close()

    if QUIET return 0

    print
    echo "This site is : ", ranking.upper(), ".\n"
    print linkcount, "links checked in $sp pages."
    print elinks, "external links found and", domains.size(), "domains."
    print ilinks, "internal links."
    if BROKEN = true
       // echo puts no space between its arguments, unlike print
       echo brocount, " broken links"
       if brocount > (linkcount / 50)  echo ", not seriously maintained."
       print
    /if

    // Search of a specific domain: list the linking pages and stop here
    if SPECIFICFLAG = true
       int nt = retrieved.size()
       if nt = 0
           print "No link found in $hostsite to $specific."
           exit(0)
       /if
       print specific, "is linked $nt time, from:"
       scan retrieved
           print "-", retrieved[]
       /scan
       exit(0)
    /if

    // Most linked domains first
    arsort(domains)

    int i = 1
    int top = min(TOPLIMIT, domains.size())
    print "Top", top, "sites:"
    // The first entry is displayed then removed at each turn
    while (i <= top)
       text key = domains.key()
       echo pad(i,3, " ", STR_PAD_LEFT), ") ", key, " : ", domains[key], "\n"
       domains.shift()
    let i + 1

return 0

main($argc, $argv)
