/*
 * $Log: blastParser.cc,v $
 * Revision 2.1  2001/02/20 21:00:07  dmaziuk
 * Few minor changes
 *
 * Revision 2.0  2001/01/11 23:44:43  dmaziuk
 * Version 2 of blast contains complete rewrite of blastParser and major rewrite
 * of (almost) everything else.
 *
// Revision 1.18  2000/10/18  21:49:45  yleigh
// modify blastParser.cc to deal with new HTML tag in blast result, multiple
// lines of name corresponding to one sequence. Disable the blastn search for
// DNA or RNA sequences. Certain blast result don't have db_code for sp entry,
// leave the db_code as ?, since they are retrievable by name. Updated some
// new databases and inserted into starfiles according to the selection rules
// set before.
//
// Revision 1.17  2000/04/07  19:45:08  seol
// modify check_if_updated() to handle the case tags are added of removed
// from the blast output loop.
//
// Revision 1.16  2000/03/31  19:49:53  seol
// Add _Sequence_subject_length in the loop. Fix bugs with bmr4394.str.
//
// Revision 1.15  2000/03/10  22:06:47  seol
// edit comment.
//
// Revision 1.14  2000/02/25  21:50:58  seol
// handles starfiles with two or more seqnences( the case that one has blast
// output, but another has some kind of error). Frame having sequence with blast
// output will be updated.
//
// Revision 1.13  2000/02/22  20:46:26  seol
// change endFlag for blast_out.(was changed by ncbi)
//
// Revision 1.12  1999/11/08  20:57:11  seol
// compare new blast result with old one and update it when there is change(
// update _Sequence_homology_query_revised_last_date tag when revised). coded
// for handling option tag.
//
// Revision 1.11  1999/10/19  19:12:03  seol
// use NMRSTAR formatter to add proper comments to the ouput starfile
//
// Revision 1.10  1999/10/13  18:58:32  seol
// add lines to makefile to configure the memory trimming code in starlib.
//
// Revision 1.9  1999/10/08  19:49:51  seol
// deal with 2 or more sequences, reduce the output(remove _Sequence_alignment_
// details), sort output by same database group
//
// Revision 1.8  1999/07/22  19:18:00  seol
// make user be able to set lower limit of sequence id value.
//
// Revision 1.7  1999/07/07  19:02:20  seol
// change the loop format from tabulate to linear
//
// Revision 1.6  1999/06/30  20:56:37  seol
// fix core dump problem that occured with bmr4356.str
//
// Revision 1.5  1999/06/17  19:41:21  seol
// add code for 'blastn'. pull out embeded data from real data. list the
// entries at most 5 except pdb.
//
// Revision 1.4  1999/03/19  16:36:56  seol
// use "SWISS_PROT" instead of "Swiss-Prot", put '?' instead of '.' for
// entry_mol_name of PDB, put "not_included" instead of '.' for seq_detail_
// value that doesn't contain value. And make entry_mol_name shorter.
//
 * Revision 1.2  1999/03/02  21:31:31  seol
 * Changed table values to use DatavalueNode::DOUBLE instead of
 * DataValueNode::NON values.  Also it now makes a new output file
 * instead of overwriting the input.
 *
 * Revision 1.1  1999/02/17  16:04:49  seol
 * query the BLAST site as if it were an actual web brower by opening
 * a socket to the BLAST 2.0 server and speaking the HTTP protocal
 *
 * $Id: blastParser.cc,v 2.1 2001/02/20 21:00:07 dmaziuk Exp $
 */



/**********************************************************************************
* The class blastParser parses the returned web page (saved as "blast_out" in 
* working directory) to get relevant data. The web page contains search result 
* from the BLAST 2.0 server. 
**********************************************************************************/


#include "blastParser.h"

// Input stream for the BLAST result file. It is opened in
// blastParser::parse() and read in parse() and blastParser::getOneDataEntry().
ifstream fin;

// Flags declared here and defined elsewhere. Only BLASTP is referenced in the
// code below: getSeqData() parses the "Positives" value only when it is set.
// NOTE(review): presumably these select the kind of search that was run --
// confirm against the definitions.
extern bool BLASTP;
extern bool BLASTN;
extern bool CARBO;

// Lower limit for the sequence identity value; parse() stops processing at
// the first hit whose identity is below this limit.
extern int limitId;

/** parse() opens and parses the file that contains search results.
  *    This is the main entry point for blastParser class.
  *
  * The input stream is closed again before returning, so parse() can be
  * called more than once.
  *
  * @param ab_entries collection that receives a dataEntry for every accepted hit
  * @param save_frame save frame get_residue_count() reads the residue count from
  *
  * @return negative value on error, 0 otherwise
  *  -1 error getting residue count, or the input ended before the start
  *     marker (blastConfig::strFlag) and the following <PRE> tag were found
  *  -2 error opening CGI query result file
  */
int blastParser::parse( dataEntries *ab_entries, SaveFrameNode *save_frame )
{
    int res_count;
    int line_cnt = 1;
    char tmp[MAX_DIGIT];

// get residue count
    if( (res_count = get_residue_count( save_frame )) == -1 ) return -1;

// IRIX doesn't seem to have string::getline(istream, string), so lines are
// read into a char buffer. Both buffers are owned by vectors: every return
// path below releases them without an explicit delete.
    vector<char> buf( MAX_STRLEN );
    char *str = &buf[0];
    vector<string> lines;

// open the file that contains blast search result.
// Error flags left over from earlier use of the shared stream are reset
// first; older libraries do not clear them on a successful open().
    fin.clear();
    fin.open( bl_config->getQueryFile() );
    if( !fin ) return -2;

// read lines from input file until we find strFlag, then skip past the
// <PRE> tag. Any stream failure (not just eof) ends the search, otherwise
// an over-long line would make getline() fail forever without reaching eof.
    bool found = true;
    fin.getline( str, MAX_STRLEN, '\n' );
    while( found && (string( str ).find( blastConfig::strFlag ) == string_npos) )
    {
        if( !fin.good() ) found = false;
        else fin.getline( str, MAX_STRLEN, '\n' );
    }
    while( found && (strncmp( str, "<PRE>", 5 ) != 0) )
    {
        if( !fin.good() ) found = false;
        else fin.getline( str, MAX_STRLEN, '\n' );
    }
    if( !found )
    {
        fin.close();
        fin.clear();
        return -1;
    }

// process input data
    do
    {
        line_cnt = getOneDataEntry( &lines );
        seq_data sd;
        memset( &sd, 0, sizeof( sd ) );
        getSeqData( &sd, &lines, res_count );

// accept the hit only if sequence identity reaches limitId and the aligned
// length covers more than half of the residues.
//NOTE: this assumes entries are ordered by id desc., so once we got past the
// limit there's no point in continuing. Remove the break below if that changes.
        const bool accepted = (sd.id >= limitId) && ( sd.seq_len > (res_count / 2) );
        if( !accepted ) break;

        int rc = 0;
        do
        {
            dataEntry data( ab_entries, bl_config );
// copy data from sd structure
            memset( tmp, 0, MAX_DIGIT );
            strncpy( tmp, sd.exp, MAX_DIGIT - 1 );
            data.setExp( tmp );
            data.setLength( sd.len );
            data.setSeqLen( sd.seq_len );
            data.setComp( sd.comp );
            data.setId( sd.id );
            data.setPos( sd.pos );
            data.setIdCount( sd.id_count );
// parseOneDataEntry() sets up db name, code and mol. name
            rc = parseOneDataEntry( &data, &lines );
#ifdef DEBUG
  cout << endl << endl << "parseOneDataEntry() done, line count = " << line_cnt;
  cout << " Expect = " << sd.exp << "/" << data.getExp() << endl;
  data.print();
#endif
            if( rc > 0 ) ab_entries->add(  data );
        } while( rc > 0 );

// start the next chunk with an empty line buffer
        lines.clear();
    } while( (line_cnt > 0) && fin.good() );

// release the file and reset stream state for the next call
    fin.close();
    fin.clear();
    return 0;
} //--------------------------------------------------------------------

/** parseOneDataEntry() parses lines vector and fills in a dataEntry object.
  *
  * The first line is expected to look like "<db>|<code>|<mol. name>"; the
  * molecule name may continue on the following lines, up to the "Length ="
  * line or the next line that starts with a known database code. The lines
  * consumed for this entry are removed from the front of the vector.
  *
  * @param data output
  * @param lines input data, modified as described above
  *
  * @return negative value on error, 0 = stop processing, 1 = more entries follow
  */
int blastParser::parseOneDataEntry( dataEntry *data, vector<string> *lines )
{
    string::size_type start, end;
    string tmp_name;
    string db_name;
    string db_code;
    string mol_name;
    vector<string>::iterator i;

// nothing to parse
    if( lines->empty() ) return 0;

// parse lines and extract db entries
    i = lines->begin();

#ifdef DEBUG
  cout << endl << "blastParser::parseOneDataEntry(0) : line = " << endl;
  cout << i->c_str() << endl << endl;
#endif

// "Length = " is the end of db entries, stop parsing
    if( (start = i->find( "Length = " )) != string_npos ) 
        if( start >= blastConfig::lengthOffset ) return 0;

// get db name
// NOTE: if you add more db names to db names map in blastConfig, 
// make sure you also add those names and search chars to the sets below
    start = i->find_first_of( "bdefgijlmnprs" );
    end = i->find_first_of( "|" );
// without a name character followed by '|' the line is not a db entry;
// substr() below would throw or pick up garbage
    if( (start == string_npos) || (end == string_npos) || (end < start) )
    {
        cerr << "**** Error: no db name found, input:" << endl;
        cerr << i->c_str() << endl;
        return -1;
    }
    tmp_name.append( i->substr( start, end - start ) );
// if name is not in the map, barf and return
    db_name.append(blastConfig::getName( tmp_name ));
    if( db_name == "" )
    {
        cerr << "**** Error: db name " << tmp_name << " is not in the map, input:" << endl;
        cerr << i->c_str() << endl;
        return 0;
    }
    data->setName( db_name );
    string_erase( db_name );
    string_erase( tmp_name );

// get accession code
    start = end + 1;
    end = i->find_first_of( "|", start );
// if no code (eg. pir or prf db) use '?'
    db_code.append( (( start == end ) ? string( "?" ) : i->substr( start, end - start )) );
    data->setCode( db_code );
    string_erase( db_code );

// get mol_name: everything after the second '|'. If there is no second '|'
// the name is empty (end + 1 would wrap around to the start of the line).
    start = ( end == string_npos ) ? i->size() : end + 1;
    mol_name = i->substr( start );

// for mol. names on more then one line
    bool done = false;
    while( !done )
    {
        ++i;
        if( i == lines->end() )
        {
// ran out of input without finding a terminator: use what we have
            done = true;
        }
        else
        {
#ifdef DEBUG
  cout << "blastParser::parseOneDataEntry() : processing mol_name" << endl;
  cout << "mol_name |" << mol_name << endl;
  cout << "input    |" << i->c_str();
#endif	
// "Length" or database name => end of mol_name
            start = i->find( "Length =" );
            if( (start != string_npos) && (start >= blastConfig::lengthOffset) )
                done = true; 
            else
            {
                for( int j = 0; j < blastConfig::NumDbs; j++ )
                {
                    string_erase( tmp_name );
                    tmp_name.append( blastConfig::getCode( j ) );
                    tmp_name.append( "|" );
                    start = i->find( tmp_name );
// db name must be in the 1st or 2nd column
                    if( (start != string_npos ) && (start < 2) )
                    {
                        done = true;
                        break;
                    }
                } // endfor
            } // endif
        } // endif i == end
#ifdef DEBUG
  cout << "done = " << ((done) ? "true" : "false") << endl;
#endif
// rewind the buffer or append the line to mol_name
        if( done ) --i;
        else
        {
            start = i->find_first_not_of( " \t" );
// blank continuation lines contribute nothing to the name
            if( start != string_npos )
            {
                mol_name.append( " " );
                mol_name.append( i->substr( start ) );
            }
#ifdef DEBUG
  cout << "mol_name is now |" << mol_name << endl;
#endif
        }
    } // endwhile

// extracted one entry, now remove prosessed lines from input.
// erase() invalidates i, so carry on from the iterator it returns
// (the first line that was not consumed).
    i = lines->erase( lines->begin(), i + 1 );
// save data
    data->setMolName( mol_name );
// and clean up the mess
    string_erase( mol_name );
    string_erase( tmp_name );

// extract query string
    if( bl_config->isSeqDet() )
    {	
// skip over sequence details
        while( (i != lines->end()) && (i->find( "Query:" ) == string_npos) ) ++i;
// get sequences
        while( i != lines->end() )
        {
            tmp_name.append( i->c_str() );
            tmp_name.append( "\n" );
            ++i;
        }
        data->setDetails( tmp_name );
    }
    string_erase( tmp_name );
#ifdef DEBUG
  cout.flush();
#endif
    return 1;
} //-------------------------------------------------------------------

/** isUpdated() returns true if results of CGI query are different from
  * what's already in DB
  *
  * The existing values are taken from the first loop in the save frame that
  * contains the _Database_entry_mol_name tag. Columns 0-6 of each row are
  * compared, in order, with db name, db code, mol. name, completeness,
  * identity, positives and expect of the corresponding data entry.
  *
  * @param data vector of data entries (new query results)
  * @param frame SaveFrameNode holding the existing results
  *
  * @return true if DB needs to be updated
  */
bool blastParser::isUpdated( dataEntries *data, SaveFrameNode *frame )
{
    List<ASTnode*> *match;
    DataLoopNode *orig_loop;
    DataLoopNameListNode *loop_name;
    LoopTableNode *loop_table;
    DataValueNode *curValue;
    
    int row_cnt, col_cnt, cur_row;
    
// NOTE(review): the list returned by searchForTypeByTag() is not released
// here -- confirm who owns it.
    match = frame->searchForTypeByTag( ASTnode::DATALOOPNODE,
                                       string( "_Database_entry_mol_name" ) );

// no loop stored yet: update only if the query returned something
    if( match->size() == 0 ) return ( data->size() != 0 );

    orig_loop = (DataLoopNode *)((*match)[0]);
    loop_name = orig_loop->getNamesPtr();
    loop_table = orig_loop->getValsPtr();

    col_cnt = (*loop_name)[0]->size();
    row_cnt = loop_table->size();
// number of columns changed        
    if( col_cnt != blastConfig::NumCols ) return true;
// new entries added
    if( row_cnt != data->size() ) return true;
// compare entries
    cur_row = 0;
    for( dataEntries::iterator e = data->begin(); e != data->end(); e++, cur_row++ )
    {
        if( cur_row >= row_cnt ) break; // just in case, row_cnt = data->size() here

// compare db_name, db_code and mol_name
        curValue =  (*(*loop_table)[cur_row])[0];
        if( curValue->myValue() != e->getName() ) return true;
        curValue =  (*(*loop_table)[cur_row])[1];
        if( curValue->myValue() != e->getCode() ) return true;
        curValue =  (*(*loop_table)[cur_row])[2];
        if( curValue->myValue() != e->getMolName() ) return true;
// NOTE(review): the three numeric columns below are compared for exact
// equality with the value parsed from the stored text; whether that can
// produce spurious differences depends on how the values were written.
// sequence completentess
        curValue =  (*(*loop_table)[cur_row])[3];
        if( e->getComp() != strtod( curValue->myValue().c_str(), NULL ) )
            return true;
// sequence id
        curValue = (*(*loop_table)[cur_row])[4];
        if( e->getId() != strtod( curValue->myValue().c_str(), NULL ) )
            return true;
// positives
        curValue = (*(*loop_table)[cur_row])[5];
        if( e->getPos() != strtod( curValue->myValue().c_str(), NULL ) )
            return true;
// expect: like every other column, a *difference* means the DB is stale
// (the test used to be inverted and reported identical values as changed)
        curValue = (*(*loop_table)[cur_row])[6];
        if( !( e->getExp() == curValue->myValue() ) )
            return true;
    } // endfor
    
    return false;
} //---------------------------------------------------------------------

/** getOneDataEntry() reads one data entry chunk from "blast_out" and
  * stores it in lines vector. NOTE: this function adds strings to lines vector.
  *
  * Reading stops at the line that starts the next data entry (the stream is
  * rewound so that line is read again by the next call), at the line that
  * contains blastConfig::endFlag, or when the input ends. Only lines up to
  * the second line containing "Score" are stored, with html tags removed.
  *
  * @param lines vector of strings where processed entry will be stored
  * @return number of lines stored. 0 = end of file or end flag reached
  *         (lines may still hold the last entry), < 0 = error
  */
int blastParser::getOneDataEntry( vector<string> *lines )
{
    streampos pos;
    int rc = 0;
    int score_cnt = 0;

// check for eof, or a stream that is already unusable
    if( fin.eof() ) return 0;
    if( !fin ) return -1;

// line buffer; owned by a vector so every return path releases it
    vector<char> buf( MAX_STRLEN );
    char *str = &buf[0];

// read in 1st line of data entry ( "><a name = " )
    fin.getline( str, MAX_STRLEN, '\n' );
    removeTags( str );
    lines->push_back( string( str ) );
    rc++;
    
// Running out of input here should never happen. good() rather than !eof()
// so that a failed read (e.g. over-long line) cannot spin forever.
    while( fin.good() )
    {
        pos = fin.tellg();
        memset( str, 0, MAX_STRLEN );
        fin.getline( str, MAX_STRLEN, '\n' );

// check if we're done
        if( string( str ).find( blastConfig::endFlag ) != string_npos ) return 0;

// next data entry -- rewind & return
        if( strncmp( str, "><a name =", 10 ) == 0 )
        {
            fin.seekg( pos );
            break;
        }
// parse input: everything from the second "Score" line on is ignored
        if( strstr( str, "Score" ) != NULL ) score_cnt++;
        if( score_cnt > 1 ) continue;

        if( (strstr( str, "</PRE>" ) == NULL) && (strstr( str, "<PRE>" ) == NULL) )
        {
            removeTags( str );
            lines->push_back( string( str ) );
            rc++;
        }
        else
        {
// reached </PRE>: skip to next <PRE>. Stop when the input ends, and read
// with the real size of the buffer.
            while( (strstr( str, "<PRE>" ) == NULL) && fin.good() )
                fin.getline( str, MAX_STRLEN, '\n' );
        }
    } // endwhile

//#ifdef DEBUG
//    cout << "blastParser::getOneDataEntry() lines = " << lines->size() << endl;
//    for( vector<string>::iterator i = lines->begin(); i != lines->end(); i++ )
//        cout << i->c_str() << endl;
//#endif

    return rc;
} //-------------------------------------------------------------------------------

/** removeTags() removes html tags from a C-string
  *
  * Everything from a '<' up to and including the next '>' is dropped, as is
  * a '>' in the very first position (data entries begin with '>'). The
  * string is filtered in place: the result is never longer than the input,
  * so no temporary buffer and no length limit are needed.
  * 
  * @param str string to process, modified in place; a null pointer is ignored
  * @return always 0
  */
int blastParser::removeTags( char *str )
{
    if( !str ) return 0;

    bool tag = false;
    const char *in = str;   // next character to examine
    char *out = str;        // next position to write; never ahead of in

// spec. case: data entry begins with '>'
    if( *in == '>' ) ++in;
    for( ; *in != '\0'; ++in )
    {
        if( tag )
        {
            if( *in == '>' ) tag = false;
        }
        else if( *in == '<' ) tag = true;
        else *out++ = *in;
    }
    *out = '\0';
    return 0;
} //-----------------------------------------------------------------------------

/** getSeqData() extracts sequence data from array of strings
  *
  * @param dest pointer to seq_data structure
  * @param lines vector of strings
  * @param res_count residue count, used to compute sequence completeness
  * @return negative value on error
  */
int blastParser::getSeqData( seq_data *dest, const vector<string> *lines, 
                             const int res_count )
{
    float seq_len;
    bool done = false;
    string::size_type start, end;
#ifdef IS_IRIX
//#define IRIX_IS_BRAIN_DEAD_DEAD_DEAD 1
    vector<string>::const_iterator i;
#else
    vector<string>::iterator i;
#endif

    dest->id_count = 0; // just in case

    for( i = lines->begin(); i != lines->end(); i++ )
    {
        if( (start = i->find( "Identities = " )) != string_npos )
            dest->id_count++;
// If multiple "Identities" lines exist, we process only the first one
        if( !done )
        {
// Length: "Length = 104", extract 104
            if( (start = i->find( "Length = " )) != string_npos )
            {
                start = i->find_first_of( "0123456789", start );
                end = i->find_first_not_of( "0123456789", start );
                dest->len = atoi( (i->substr( start, end - start )).c_str() );
            }
// Expect: "Expect = 9e-55", extract 9e-55
            if( (start = i->find( "Expect = " )) != string_npos )
            {
#ifdef DEBUG
  cout << "getSeqDet(): Expect = " << i->c_str() << endl;
#endif
                start = i->find_first_of( "0123456789Ee-", start + 6 );
                end = i->find_first_not_of( "0123456789Ee-", start );
                memset( dest->exp, 0, MAX_DIGIT );
                strncpy( dest->exp, (i->substr( start, end - start )).c_str(), MAX_DIGIT - 1 );
            }
// Identities: "Identities = 101/103 (98%), extract
//  dest->len = 101, seq_len = 103, dest->id = 98 
            if( (start = i->find( "Identities = " )) != string_npos )
            {
                start = i->find_first_of( "0123456789", start );
                end = i->find_first_not_of( "0123456789", start );
                dest->seq_len = atoi( (i->substr( start, end - start )).c_str() );

                start = i->find_first_of( "0123456789", end );
                end = i->find_first_not_of( "0123456789", start );
                seq_len = atof( (i->substr( start, end - start )).c_str() );

                start = i->find_first_of( "0123456789", end );
                end = i->find_first_not_of( "0123456789", start );
                dest->id = atoi( (i->substr( start, end - start )).c_str() );

// Positives: "Positives = 101/103 (98%), extract 98
                if( BLASTP )  // BLASTN doesn't have positive value
                {
                    start = i->find_first_of( "(" );
                    start = i->find_first_of( "0123456789", start );
                    end = i->find_first_not_of( "0123456789", start );
                    dest->pos = atoi( (i->substr( start, end - start )).c_str() );
                }
                else dest->pos = 0;
// if there's more than one "Identities" line, the first one gets top score.
// we need to count the rest
                done = true;
            } // endif "Identities = "
        } // endif !done
    } // endfor

    //compute sequence_completeness using res_count and seq_len
    dest->comp = (seq_len / res_count) * 100;

    return 0;
} //-------------------------------------------------------------------

/** get_residue_count() retrieves residue_count from the input file.
 * It will be used to compute
 * _Sequence_query_to_submitted_percentage((sequence length/ residue count) * 100).
 *
 * The save frame must contain exactly one _Residue_count tag and its value
 * must start with a positive integer that fits in an int. Anything else
 * ("?", empty or non-numeric text, zero, negative numbers) is reported on
 * cout and treated as an error, because the count is later used as a divisor.
 *
 * @param save_frame pointer to SaveFrameNode
 * @return -1 on error, residue count otherwise
 */
int blastParser::get_residue_count( SaveFrameNode *save_frame )
{
    List<ASTnode*> *match;
    DataItemNode *res_count_ptr;

    match = save_frame->searchForTypeByTag( ASTnode::DATAITEMNODE,
                                            string( "_Residue_count" ) );
    if( match->size() != 1 )
    {
        cout << "***** Error: Save frame contains none or more than one" << endl;
        cout << "             _Residue_count tag." << endl;
        return -1;
    }
    res_count_ptr = (DataItemNode *)( (*match)[0] );

// keep our own copy of the text so pointers into it stay valid below
    const string value( res_count_ptr->myValue().c_str() );
    if( value == "?" )
    { 
        cout << "***** Error: _Residue_count tag has \"?\" as value." << endl;
        return -1;
    }

// strtol() reports where parsing stopped, so "no digits at all" can be told
// apart from a real number (atoi() would silently return 0)
    char *stop = 0;
    const long parsed = strtol( value.c_str(), &stop, 10 );
    const bool no_digits = ( stop == value.c_str() );
    const bool fits_int = ( static_cast<long>( static_cast<int>( parsed ) ) == parsed );
    if( no_digits || !fits_int || (parsed <= 0) )
    {
        cout << "***** Error: _Residue_count tag has invalid value \""
             << value.c_str() << "\"." << endl;
        return -1;
    }
    return static_cast<int>( parsed );
} //--------------------------------------------------------------------------------





