/**
* 'Spider' - делает отчет о структуре сайтов
* @author Andrei Borziak
*/
import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.util.*;
import java.net.*;
/**
 * 'Spider' - builds a report about the structure of a web site.
 *
 * <p>The program reads a task file line by line and executes the commands
 * found there. The only supported command is
 * {@code weblinks <start-url> <report-file>}, which crawls pages reachable
 * from the start URL (same site only) and writes an HTML table with the URL,
 * title, load status, size and incoming link count of every visited page.
 * Lines that are empty or start with {@code #} are ignored, {@code exit}
 * stops the task.
 *
 * <p>Progress messages go to an AWT window (when enabled) or to
 * {@code System.err}; executed commands are recorded in {@code spider.log}.
 */
public class spider
implements ActionListener, WindowListener
{
    /** Bookkeeping record for one crawled (or still queued) URL. */
    class UrlInfo
    {
        String m_sUrl;   // page address
        String m_sTitle; // page title
        int m_iLinks;    // number of links to this page found on crawled pages
        int m_iStatus;   // 0 - loaded, -1 - load failed, -2 - not loaded (document/archive)
        int m_iSize;     // length of the loaded page text, in characters
    }

    // Maximum number of pages visited by one weblinks command.
    // NOTE(review): the original code stopped after 4 pages with a bare
    // "size() > 3" check; this looks like a debugging limit - confirm before raising it.
    private static final int MAX_PAGES = 4;

    // URLs containing one of these fragments are listed in the report but never downloaded.
    private static final String[] SKIP_LOAD_EXT =
        { ".pdf", ".doc", ".rtf", ".ppt", ".zip", ".rar", ".txt" };

    // Links containing one of these fragments (pictures) are ignored completely.
    private static final String[] IMAGE_EXT = { ".jpg", ".jpeg", ".gif", ".png" };

    // class variables
    // volatile: written by the AWT event thread (Stop button / window close),
    // read by the thread that runs the task
    static private volatile boolean m_bRun = true; //if false then stop processing
    private String m_strCmd, m_strPar1, m_strPar2; //command parsing results
    private BufferedReader m_br;                   //current input stream
    private BufferedWriter m_bwLog = null;         //log-file writer
    private boolean m_bOver = false;               //if false, do not override existing files
    public boolean m_bTraceGet = false;            //true - trace get method
    private Frame m_frame = null;                  //window frame
    private TextArea m_ta;                         //text area for output
    private String m_strParseData;                 //data to parse
    private int m_iTagStart;                       //start of tag
    private int m_iTagEnd;                         //end of tag

    /**
     * Program entry point.
     *
     * @param args {@code task_file bWindow bOverride bTraceGet bLogAppend};
     *             boolean flags are switched on with the value {@code 1};
     *             {@code help} as the first argument prints the usage line
     */
    public static void main( String[] args )
    {
        String sTask = ( args.length > 0 ) ? args[ 0 ] : "task1.tsk";
        if( sTask.equalsIgnoreCase( "help" ) )
        {
            System.err.println( "Usage: java spider task_file bWindow bOverride bTraceGet bLogAppend" );
            return;
        }
        boolean bWind = ( args.length > 1 ) ? args[ 1 ].equalsIgnoreCase( "1" ) : true;
        boolean bOver = ( args.length > 2 ) ? args[ 2 ].equalsIgnoreCase( "1" ) : true;
        boolean bTraceGet = ( args.length > 3 ) ? args[ 3 ].equalsIgnoreCase( "1" ) : false;
        boolean bLogAppend = ( args.length > 4 ) ? args[ 4 ].equalsIgnoreCase( "1" ) : false;
        spider me = new spider( bWind, bOver );
        me.m_bTraceGet = bTraceGet;
        me.logOpen( bLogAppend );
        me.exec( sTask );
        me.logClose();
    }

    /**
     * Creates the spider and, optionally, its output window.
     *
     * @param bWind true - show a window with a text area and a "Stop" button;
     *              false - print messages to {@code System.err}
     * @param bOver true - {@code get} may overwrite existing non-empty output files
     */
    public spider(boolean bWind, boolean bOver)
    {
        m_bOver = bOver;
        if( !bWind )
            return;
        // create a window
        m_frame = new Frame( "Spider" );
        m_frame.setBackground( Color.darkGray );
        m_frame.setSize( 600, 400 );
        m_frame.setLayout( null );
        m_ta = new TextArea( 10, 80 );
        m_ta.setBackground( Color.white );
        m_ta.setBounds( 20, 40, 560, 300 );
        m_frame.add( m_ta );
        Button btnStop = new Button( "Stop" );
        btnStop.addActionListener( this );
        btnStop.setBounds( 270, 360, 60, 20 );
        m_frame.add( btnStop );
        m_frame.setVisible(true);
        m_frame.addWindowListener( this );
    }

    // frame event handlers

    /** Closing the window stops processing, closes the log and exits the JVM. */
    public void windowClosing(WindowEvent ev)
    {
        message( "Process will be terminated..." );
        m_bRun = false;
        logClose();
        System.exit(0);
    }
    public void windowClosed(WindowEvent ev)
    {
    }
    public void windowOpened(WindowEvent ev)
    {
    }
    public void windowDeiconified(WindowEvent ev)
    {
    }
    public void windowIconified(WindowEvent ev)
    {
    }
    public void windowActivated(WindowEvent ev)
    {
    }
    public void windowDeactivated(WindowEvent ev)
    {
    }

    /** Button handler: "Stop" asks the running task to finish and closes the log. */
    public void actionPerformed(ActionEvent ev)
    {
        String label = ev.getActionCommand();
        if( "stop".equalsIgnoreCase( label ) )
        {
            message( "Process was stopped." );
            m_bRun = false;
            logClose();
        }
    }

    /**
     * Splits a command line into command and up to two parameters
     * (stored in m_strCmd, m_strPar1, m_strPar2; missing parts are null).
     * Tokens are separated by spaces, so the two-character sequence
     * {@code \s} inside the second parameter stands for a space.
     * Tokens after the third one are ignored.
     */
    private void parseCmd( String strParam )
    {
        StringTokenizer st = new StringTokenizer( strParam, " ", false );
        m_strCmd = st.hasMoreTokens() ? st.nextToken() : null;
        m_strPar1 = st.hasMoreTokens() ? st.nextToken() : null;
        m_strPar2 = st.hasMoreTokens() ? st.nextToken().replace( "\\s", " " ) : null;
    }

    /**
     * Executes the commands of a task file one after another until the file
     * ends, an {@code exit} line is met or processing is stopped.
     * Errors are reported through {@link #message} and end the task.
     *
     * @param strFileIn path of the task file
     */
    public void exec( String strFileIn )
    {
        m_br = null;
        try
        {
            m_br = new BufferedReader( new FileReader( new File( strFileIn ) ) );
            String sHead = "task " + strFileIn;
            message( sHead );
            logWrite( sHead );
            String sLine;
            while( m_bRun && ( sLine = m_br.readLine() ) != null )
            {
                sLine = sLine.trim();
                if( sLine.equalsIgnoreCase( "exit" ) )
                    break;
                if( sLine.length() == 0 || sLine.charAt( 0 ) == '#' )
                    continue; //empty line or comment
                exeCmd( sLine );
            }
            logWrite( "Process finished" );
            message( "Process finished" );
        }
        catch( Exception e )
        {   // task boundary: report anything that went wrong and stop the task
            message( e.toString() );
        }
        finally
        {   // the task file is closed even if a command failed
            closeQuietly( m_br );
        }
    }

    /** Executes one command line; unknown commands are only noted in the log. */
    private void exeCmd( String str1 )
    {
        parseCmd( str1 );
        if( m_strCmd != null && m_strCmd.equalsIgnoreCase( "weblinks" ) )
        {   //find web links
            message( str1 );
            if( weblinks( m_strPar1, m_strPar2 ) == 0 )
                logWrite( str1 );
            else logWrite( "Error:" + str1 );
        }
        else logWrite( "Unknown: " + str1 );
    }

    /**
     * Downloads a URL line by line.
     *
     * @param strFileIn  URL to load
     * @param strFileOut file to store the text in (lines end with CR LF), or
     *                   null to keep the text in m_strParseData for parsing.
     *                   In memory the lines are joined with a single space so
     *                   that a tag split across lines stays parseable.
     *                   Empty lines are dropped in both modes.
     * @return 0 on success (also when an existing file is kept because
     *         overriding is off), -1 on any error
     */
    private int get( String strFileIn, String strFileOut )
    {
        BufferedWriter bw = null;
        BufferedReader in = null;
        StringBuffer sbuf = null;
        try
        {
            if( strFileOut != null )
            {
                File fileOut = new File( strFileOut );
                if( !m_bOver && fileOut.exists() && fileOut.length() > 0 )
                    return 0;
                bw = new BufferedWriter( new FileWriter( fileOut ) );
            }
            else
            {
                m_strParseData = "";
                sbuf = new StringBuffer();
            }
            in = new BufferedReader(
                new InputStreamReader( new URL( strFileIn ).openStream() ) );
            int iLines = 0;
            String sLine;
            while( m_bRun && ( sLine = in.readLine() ) != null )
            {
                iLines++;
                if( sLine.length() == 0 )
                    continue;
                if( bw != null )
                    bw.write( sLine + "\r\n" );
                else sbuf.append( sLine ).append( ' ' );
                if( m_bTraceGet )
                    message( sLine );
            }
            message( "get " + iLines + " lines from " + strFileIn );
            if( bw != null )
                bw.close(); //closed inside try so that a failed flush counts as an error
            else m_strParseData = sbuf.toString();
            return 0;
        }
        catch( Exception e )
        {   // URLs come from crawled pages; any failure just marks the page as failed
            message( e.toString() );
            return -1;
        }
        finally
        {   // never leak the connection or the output file
            closeQuietly( in );
            closeQuietly( bw );
        }
    }

    /**
     * Crawls a site starting from strFileIn (for example
     * http://www.pvobr.ru/index.asp) and writes an HTML report about the
     * visited pages, sorted by URL, to strFileOut.
     * Only links inside the site root (the start URL without its last path
     * element) are followed; at most MAX_PAGES pages are visited.
     *
     * @param strFileIn  start URL
     * @param strFileOut path of the report file (overwritten)
     * @return the load status of the start page (0 - loaded), or -1 when a
     *         parameter is missing or the report cannot be written
     */
    private int weblinks( String strFileIn, String strFileOut )
    {
        if( strFileIn == null || strFileOut == null )
        {
            message( "weblinks: start url and report file are required" );
            return -1;
        }
        Vector <UrlInfo> vecURL = new Vector <UrlInfo>();  //stack of pages to visit
        Vector <UrlInfo> vecDone = new Vector <UrlInfo>(); //visited pages
        String sRoot = siteRoot( strFileIn );
        UrlInfo uiStart = newUrlInfo( strFileIn, 0 );
        vecURL.add( uiStart );
        while( !vecURL.isEmpty() && m_bRun && vecDone.size() < MAX_PAGES )
        {
            UrlInfo ui = vecURL.remove( vecURL.size() - 1 );
            String sUrl = ui.m_sUrl;
            //documents and archives are reported but not downloaded
            ui.m_iStatus = isSkippedDocument( sUrl ) ? -2 : get( sUrl, null );
            vecDone.add( ui );
            logWrite( sUrl );
            if( ui.m_iStatus == 0 )
            {   //find local links, add to stack
                ui.m_iSize = m_strParseData.length();
                collectLinks( ui, sRoot, currentDir( sUrl, sRoot ), vecURL, vecDone );
            }
        }
        //sort by url (stable)
        Collections.sort( vecDone, new Comparator <UrlInfo>()
        {
            public int compare( UrlInfo ui1, UrlInfo ui2 )
            {
                return ui1.m_sUrl.compareTo( ui2.m_sUrl );
            }
        } );
        int iRes = uiStart.m_iStatus;
        if( !writeReport( strFileOut, vecDone ) )
            iRes = -1;
        return iRes;
    }

    /** Creates a record for a URL that has the given number of known incoming links. */
    private UrlInfo newUrlInfo( String sUrl, int iLinks )
    {
        UrlInfo ui = new UrlInfo();
        ui.m_iStatus = 0;
        ui.m_iSize = 0;
        ui.m_iLinks = iLinks;
        ui.m_sUrl = sUrl;
        ui.m_sTitle = "";
        return ui;
    }

    /** Returns the start URL without its last path element; "http://host" style URLs are kept whole. */
    private static String siteRoot( String sUrl )
    {
        int i = sUrl.lastIndexOf( '/' );
        if( i > 0 && sUrl.charAt( i - 1 ) != '/' )
            return sUrl.substring( 0, i );
        return sUrl;
    }

    /**
     * Returns the directory of a page relative to the site root.
     * The result always starts and ends with '/'; a page directly under the
     * root (e.g. "/index.asp") lives in "/".
     */
    private static String currentDir( String sUrl, String sRoot )
    {
        String sPath = "";
        if( sUrl.regionMatches( true, 0, sRoot, 0, sRoot.length() ) )
            sPath = sUrl.substring( sRoot.length() );
        //a '/' inside the query or fragment must not be taken for a directory separator
        int iCut = sPath.indexOf( '?' );
        int iHash = sPath.indexOf( '#' );
        if( iHash >= 0 && ( iCut < 0 || iHash < iCut ) )
            iCut = iHash;
        if( iCut >= 0 )
            sPath = sPath.substring( 0, iCut );
        int iSlash = sPath.lastIndexOf( '/' );
        if( iSlash < 0 )
            return "/";
        return sPath.substring( 0, iSlash + 1 );
    }

    /** True if s contains one of the fragments somewhere after its first character. */
    private static boolean containsAny( String s, String[] asParts )
    {
        for( int i = 0; i < asParts.length; i++ )
        {
            if( s.indexOf( asParts[ i ] ) > 0 )
                return true;
        }
        return false;
    }

    /** True for URLs of documents/archives that must not be downloaded. */
    private static boolean isSkippedDocument( String sUrl )
    {
        return containsAny( sUrl.toLowerCase( Locale.ENGLISH ), SKIP_LOAD_EXT );
    }

    /** True for an opening anchor tag with attributes: "<a " or "<A ". */
    private static boolean isAnchorTag( String sTag )
    {
        if( sTag.length() < 3 )
            return false;
        char c = sTag.charAt( 1 );
        return ( c == 'a' || c == 'A' ) && Character.isWhitespace( sTag.charAt( 2 ) );
    }

    /**
     * Decides whether a link target should be crawled. Empty links, "/",
     * in-page anchors, mailto:/javascript: links, pictures and links to
     * other sites are rejected.
     */
    private static boolean isCrawlable( String sHref, String sRoot )
    {
        if( sHref.length() == 0 || sHref.equals( "/" ) || sHref.startsWith( "#" ) )
            return false;
        String sLow = sHref.toLowerCase( Locale.ENGLISH );
        if( sLow.startsWith( "mailto:" ) || sLow.startsWith( "javascript:" ) )
            return false;
        if( containsAny( sLow, IMAGE_EXT ) )
            return false;
        if( sLow.startsWith( "http" ) &&
            !sHref.regionMatches( true, 0, sRoot, 0, sRoot.length() ) )
            return false; //other site
        return true;
    }

    /**
     * Turns a link target into a full URL.
     *
     * @param sHref   link as written in the page
     * @param sRoot   site root, without trailing '/'
     * @param sCurDir directory of the current page, starts and ends with '/'
     */
    private static String resolveLink( String sHref, String sRoot, String sCurDir )
    {
        if( sHref.toLowerCase( Locale.ENGLISH ).startsWith( "http" ) )
            return sHref;
        if( sHref.startsWith( "/" ) )
            return sRoot + sHref;
        if( sHref.startsWith( "../" ) )
        {   //dir up (one level; further "../" parts are left for the server)
            String sParent = sCurDir.substring( 0, sCurDir.length() - 1 ); //cut '/'
            int k = sParent.lastIndexOf( '/' );
            sParent = ( k >= 0 ) ? sParent.substring( 0, k + 1 ) : "/";
            return sRoot + sParent + sHref.substring( 3 );
        }
        if( sHref.startsWith( "./" ) )
            return sRoot + sCurDir + sHref.substring( 2 ); //cur dir
        return sRoot + sCurDir + sHref;
    }

    /**
     * Looks a URL up in a list (ignoring case); when found, counts one more
     * incoming link for it.
     *
     * @return true if the URL is already in the list
     */
    private boolean countKnown( Vector <UrlInfo> vec, String sUrl )
    {
        for( int k = 0; k < vec.size(); k++ )
        {
            UrlInfo ui = vec.get( k );
            if( sUrl.equalsIgnoreCase( ui.m_sUrl ) )
            {
                ui.m_iLinks++;
                return true;
            }
        }
        return false;
    }

    /**
     * Scans the page held in m_strParseData: stores its title in uiPage and
     * pushes every new same-site link onto the stack of pages to visit.
     */
    private void collectLinks( UrlInfo uiPage, String sRoot, String sCurDir,
                               Vector <UrlInfo> vecURL, Vector <UrlInfo> vecDone )
    {
        int len = m_strParseData.length();
        int i = 0;
        while( i < len && getTag( i ) >= 0 )
        {
            String sTag = m_strParseData.substring( m_iTagStart, m_iTagEnd + 1 );
            if( sTag.equalsIgnoreCase( "<title>" ) )
            {
                uiPage.m_sTitle = getTitle( m_iTagEnd + 1 );
            }
            else if( isAnchorTag( sTag ) )
            {   //link tag
                String sHref = getAttr( sTag, "href" );
                if( isCrawlable( sHref, sRoot ) )
                {
                    //attribute values are HTML-encoded, '&' arrives as "&amp;"
                    String sLink = resolveLink( sHref, sRoot, sCurDir ).replace( "&amp;", "&" );
                    if( !countKnown( vecURL, sLink ) && !countKnown( vecDone, sLink ) )
                    {
                        vecURL.add( newUrlInfo( sLink, 1 ) );
                        message( "new " + sLink );
                    }
                }
            }
            i = m_iTagEnd + 1;
        }
    }

    /** Escapes the characters that have a special meaning in HTML text. */
    private static String escapeHtml( String s )
    {
        return s.replace( "&", "&amp;" )
                .replace( "<", "&lt;" )
                .replace( ">", "&gt;" )
                .replace( "\"", "&quot;" );
    }

    /**
     * Writes the HTML report table.
     * URLs are escaped because they come from crawled pages; titles are page
     * markup already (they never contain '<') and are written as found.
     *
     * @return true if the report was written completely
     */
    private boolean writeReport( String strFileOut, Vector <UrlInfo> vecDone )
    {
        BufferedWriter bwLog = null;
        try
        {
            bwLog = new BufferedWriter( new FileWriter( new File( strFileOut ), false ) );
            bwLog.write( "<h4>Total " + vecDone.size() + "</h4>\r\n" );
            bwLog.write( "<table border=1 cellpadding=2 cellspacing=0 bordercolordark=#ffffff>\r\n" );
            bwLog.write( "<tr><th>Url</th><th>Title</th><th>Status</th><th>Size</th><th>From</th></tr>\r\n" );
            for( int i = 0; i < vecDone.size(); i++ )
            {
                UrlInfo ui = vecDone.get( i );
                bwLog.write( "<tr><td>" + escapeHtml( ui.m_sUrl ) + "</td><td>" +
                    ui.m_sTitle + "</td><td>" +
                    ui.m_iStatus + "</td><td>" +
                    ui.m_iSize + "</td><td>" +
                    ui.m_iLinks + "</td></tr>\r\n" );
            }
            bwLog.write( "</table>\r\n" );
            bwLog.close(); //closed inside try so that a failed flush is reported
            return true;
        }
        catch( IOException e )
        {
            message( e.toString() );
            return false;
        }
        finally
        {
            closeQuietly( bwLog );
        }
    }

    /**
     * Finds the next html tag in m_strParseData at or after iStart, skipping
     * comments. The tag position is stored in m_iTagStart / m_iTagEnd.
     *
     * @return index of the closing '>' or -1 if there is no complete tag
     */
    private int getTag( int iStart )
    {
        m_iTagStart = -1;
        m_iTagEnd = -1;
        int i = iStart;
        while( true )
        {
            i = m_strParseData.indexOf( '<', i );
            if( i < 0 )
                return -1;
            if( !m_strParseData.startsWith( "<!--", i ) )
                break;
            //comment, skip
            int iClose = m_strParseData.indexOf( "-->", i + 4 );
            if( iClose < 0 )
                return -1; //unterminated comment hides the rest of the page
            i = iClose + 3;
        }
        m_iTagStart = i;
        m_iTagEnd = m_strParseData.indexOf( '>', i + 1 );
        return m_iTagEnd;
    }

    /**
     * Extracts the value of an attribute from a tag.
     * The attribute name is matched ignoring case and must be a whole name
     * (preceded by whitespace, followed by '='). The value may be quoted
     * with ' or " (the closing quote must match the opening one) or
     * unquoted (it then ends at whitespace or '>'). The value keeps its
     * original case.
     *
     * @param sTag  complete tag text, for example {@code <a href="x.html">}
     * @param sAttr attribute name
     * @return the trimmed value, or "" if the attribute is absent or malformed
     */
    private String getAttr( String sTag, String sAttr )
    {
        int len = sTag.length();
        int alen = sAttr.length();
        for( int iName = 1; iName + alen <= len; iName++ )
        {
            if( !sTag.regionMatches( true, iName, sAttr, 0, alen ) )
                continue;
            if( !Character.isWhitespace( sTag.charAt( iName - 1 ) ) )
                continue; //tail of another name, e.g. data-href
            int i = skipWhitespace( sTag, iName + alen );
            if( i >= len || sTag.charAt( i ) != '=' )
                continue; //the word occurs inside some other text
            i = skipWhitespace( sTag, i + 1 );
            if( i >= len )
                return "";
            char cQuote = sTag.charAt( i );
            if( cQuote == '"' || cQuote == '\'' )
            {
                int iClose = sTag.indexOf( cQuote, i + 1 );
                if( iClose < 0 )
                    return "";
                return sTag.substring( i + 1, iClose ).trim();
            }
            //attr begins neither with " nor '
            int iEnd = i;
            while( iEnd < len && sTag.charAt( iEnd ) != '>' &&
                   !Character.isWhitespace( sTag.charAt( iEnd ) ) )
            {
                iEnd++;
            }
            return sTag.substring( i, iEnd );
        }
        return "";
    }

    /** Returns the index of the first non-whitespace character at or after i (or the length). */
    private static int skipWhitespace( String s, int i )
    {
        while( i < s.length() && Character.isWhitespace( s.charAt( i ) ) )
            i++;
        return i;
    }

    /**
     * Returns the page title: the text of m_strParseData from iStart up to
     * the next '<', trimmed; "" if no tag follows.
     */
    private String getTitle( int iStart )
    {
        int iEnd = m_strParseData.indexOf( '<', iStart );
        if( iEnd < 0 )
            return "";
        return m_strParseData.substring( iStart, iEnd ).trim();
    }

    /**
     * Opens the log file spider.log.
     *
     * @param bAppendLog true - append to an existing log, false - start a new one
     */
    private synchronized void logOpen( boolean bAppendLog )
    {
        File fileLog = new File( "spider.log" );
        try
        {
            m_bwLog = new BufferedWriter( new FileWriter( fileLog, bAppendLog ) );
        }
        catch( IOException e )
        {
            message( e.toString() );
        }
    }

    /** Closes the log file; does nothing if it is not open. */
    private synchronized void logClose()
    {
        if( m_bwLog == null )
            return;
        try
        {
            m_bwLog.close();
        }
        catch( IOException e )
        {
            message( e.toString() );
        }
        m_bwLog = null;
    }

    /**
     * Writes one line to the log file. Nothing happens when the log is not
     * open (it could not be created, or "Stop" has already closed it).
     */
    private synchronized void logWrite( String sMessage )
    {
        if( m_bwLog == null )
            return;
        try
        {
            m_bwLog.write( sMessage + "\r\n" );
        }
        catch( IOException e )
        {
            message( e.toString() );
        }
    }

    /** Shows a message in the window, or on System.err when there is no window. */
    private void message( String mes )
    {
        if( m_frame != null )
            m_ta.append( mes + "\r\n" );
        else System.err.println( mes );
    }

    /** Closes a stream, ignoring errors; used on cleanup paths only. */
    private static void closeQuietly( Closeable c )
    {
        if( c == null )
            return;
        try
        {
            c.close();
        }
        catch( IOException ignored )
        {
            //nothing useful can be done when cleanup itself fails
        }
    }
}