Я знаю, что вопрос помечен тегом awk, но awk кажется мне неподходящим инструментом для этой задачи, учитывая ожидаемый размер наборов данных. Моя программа на C получилась немного длиннее, чем я ожидал, — отчасти потому, что я добавил код для проверки символов конца строки и длины строк.
[dennis@localhost dna]$ gcc -Wall reindex.c
[dennis@localhost dna]$ ./a.out sequence.dat position.dat
1 T
2 C
7 A
39 T
51 G
Кажется, сработало после того, как я скопировал текст вашего примера sequence-file1 в sequence.dat и position-file2 в position.dat.
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
/** Print the command-line synopsis to stderr. */
void usage(int argc, char **argv);

/** Measure the text length and total length of the lines in a file. */
int analyze(
    FILE *fp,            /**< IN: stream to measure */
    long *pLineTextLen,  /**< OUT: Length of alpha text per line */
    long *pLineBinLen    /**< OUT: Total length of line including lf */
);

/** Print the characters found at the listed positions of a sequence file. */
int reindex(
    FILE *seqFp,         /**< IN: file with sequence to reindex */
    FILE *posFp,         /**< IN: file with indexes to extract */
    long lineTextLen,    /**< IN: text to index per line */
    long lineBinLen      /**< IN: characters including termination */
);
/**
 * Entry point: print the sequence characters found at requested positions.
 *
 * argv[1] names the sequence file and argv[2] the position file. The line
 * layout of the sequence file is measured with analyze() and the lookups
 * are then performed by reindex().
 *
 * Returns 0 on success. On failure returns a non-zero value: either the
 * errno left by fopen() or a negated source line number.
 */
int main( int argc, char **argv)
{
    int errval = 0;
    FILE *seqFp = NULL;
    FILE *posFp = NULL;
    long lineTextLen = 0;
    long lineBinLen = 0;
    char *sequenceName = NULL;
    int argIdx = 1;

    if (argIdx >= argc)
    {
        usage(argc, argv);
        errval = -__LINE__;
        goto exiterror;
    }

    /* Binary mode: reindex() seeks to computed byte offsets, which is only
     * well-defined on a binary stream. analyze() already counts both '\r'
     * and '\n' as line termination, so no newline translation is needed. */
    errno = 0;
    seqFp = fopen(argv[argIdx], "rb");
    if (seqFp == NULL)
    {
        /* fopen() is not required to set errno; never report success here */
        errval = (errno != 0) ? errno : -__LINE__;
        fprintf(stderr,"Unable to open %s\n",argv[argIdx]);
        goto exiterror;
    }
    sequenceName = argv[argIdx];

    argIdx++;
    if (argIdx >= argc)
    {
        usage(argc, argv);
        errval = -__LINE__;
        goto exiterror;
    }

    /* The position file is read line by line with fgets(); text mode is fine */
    errno = 0;
    posFp = fopen(argv[argIdx], "r");
    if (posFp == NULL)
    {
        errval = (errno != 0) ? errno : -__LINE__;
        fprintf(stderr,"Unable to open %s\n",argv[argIdx]);
        goto exiterror;
    }

    errval = analyze(seqFp, &lineTextLen, &lineBinLen);
    if (errval)
    {
        fprintf(stderr,"Unable to estimate line length of %s\n"
                ,sequenceName);
        errval = -__LINE__;
        goto exiterror;
    }

    errval = reindex(seqFp, posFp, lineTextLen, lineBinLen);
    if (errval)
    {
        fprintf(stderr,"Unable to reindex (errval=%i)\n"
                ,errval);
        goto exiterror;
    }

exiterror:
    /* Single cleanup point, reached on success as well as on every error */
    if (seqFp != NULL)
    {
        fclose(seqFp);
        seqFp = NULL;
    }
    if (posFp != NULL)
    {
        fclose(posFp);
        posFp = NULL;
    }
    return(errval);
}
/**
 * Print the command-line synopsis to stderr.
 *
 * argc and argv are the values received by main(). The program name is
 * taken from argv[0] when it is available; a fixed fallback name is used
 * otherwise, because argv[0] may be NULL when argc is 0 and passing NULL
 * to "%s" is undefined behaviour.
 */
void usage(int argc,char **argv)
{
    const char *progName = "reindex";

    if (argc > 0 && argv != NULL && argv[0] != NULL && argv[0][0] != '\0')
    {
        progName = argv[0];
    }
    fprintf(stderr,"%s {sequence-file} {position-file}\n"
            ,progName);
}
/*********************************************************************/
/** Analyze file to determine line length
 *
 * Reads characters from the start of the stream and measures two values:
 * the number of alphabetic characters per line, and the total number of
 * characters per line including its termination ('\r', '\n' or blanks).
 * Each value must be seen once and then confirmed twice on following
 * lines before the file is accepted, so reading stops after the first
 * few lines.
 *
 * Any character that is neither alphabetic nor a terminator is treated
 * as an error: it would not be counted, so the measured lengths would
 * no longer match the real byte layout of the file.
 *
 * The stream is rewound before returning. The output parameters receive
 * the lengths measured so far even when the function fails; either may
 * be NULL.
 *
 * return non-zero if lines not consistent or other error.
 *********************************************************************/
int analyze(
FILE *fp
,long *pLineTextLen /**< OUT: Length of alpha text per line */
,long *pLineBinLen /**< OUT: Total length of line including lf */
)
{
    enum { CONFIRMATIONS_NEEDED = 4 }; /* 2 text and 2 bin */
    enum
    {
        TEXT_READ = 0,
        TERM_READ = 1
    } state = TEXT_READ;
    long lineTextLen = 0;  /* 0 means "not measured yet" */
    long lineBinLen = 0;   /* 0 means "not measured yet" */
    long count = 0;        /* characters seen since the current line began */
    int confirmCount = 0;
    int failed = 0;
    int input;

    while (confirmCount < CONFIRMATIONS_NEEDED
           && (input = fgetc(fp)) != EOF)
    {
        if (isalpha(input))
        {
            if (state == TERM_READ)
            {
                /* First letter of a new line: the previous line is complete */
                state = TEXT_READ;
                if (lineBinLen == 0)
                {
                    lineBinLen = count;
                }
                else if (count == lineBinLen)
                {
                    confirmCount++;
                }
                else
                {
                    failed = 1; /* mismatch */
                    break;
                }
                count = 0; /* start new line */
            }
            count++;
        }
        else if (input == '\r' || input == '\n' || isblank(input))
        {
            if (state == TEXT_READ)
            {
                /* First terminator after text: the text part is complete */
                state = TERM_READ;
                if (lineTextLen == 0)
                {
                    lineTextLen = count;
                }
                else if (count == lineTextLen)
                {
                    confirmCount++;
                }
                else
                {
                    failed = 1; /* mismatch */
                    break;
                }
            }
            count++;
        }
        else
        {
            /* Digit, punctuation, control character...: not a plain
             * alphabetic sequence file, so the lengths cannot be trusted */
            failed = 1;
            break;
        }
    }

    rewind(fp);
    if (pLineTextLen)
    {
        *pLineTextLen = lineTextLen;
    }
    if (pLineBinLen)
    {
        *pLineBinLen = lineBinLen;
    }
    /* non-zero if rejected or not confirmed */
    return(failed || confirmCount < CONFIRMATIONS_NEEDED);
}
/**********************************************************************/
/** reindex sequence file to std out.
 *
 * Print char at specified character indexes in sequence file.
 * Character indexes are one-based index of characters in
 * seq file not including line terminations. Line length and
 * termination are assumed to be consistent and specified by
 * passed parameters.
 *
 * Indexes are read as text strings one per line from pos file and
 * converted with strtol() using base 0. Reading stops at end of file
 * or at the first line that does not start with an alphanumeric
 * character (for example an empty line).
 *
 * Each result is written to stdout as "<index>\t<char>\n".
 *
 * \return non-zero on error: a missing stream, unusable line lengths,
 *         an index that is zero or out of range for long, a failed
 *         seek, or a position past the end of the sequence file.
 *********************************************************************/
int reindex(
FILE *seqFp /**< IN: file with sequence to reindex */
,FILE *posFp /**< IN: file with indexes to extract */
,long lineTextLen /**< IN: text to index per line */
,long lineBinLen /**< IN: characters including termination */
)
{
    int errval = 0;
    char buffer[80];

    /* lineTextLen is used as a divisor below, and a line can never be
     * shorter in total than its text part */
    if (seqFp == NULL || posFp == NULL
        || lineTextLen <= 0 || lineBinLen < lineTextLen)
    {
        errval = -__LINE__;
        goto exiterror;
    }

    while (fgets(buffer, sizeof(buffer), posFp) != NULL)
    {
        long index;
        long lines;
        long seekPos;
        int sequence;

        /* Cast first: <ctype.h> functions are undefined for negative char */
        if (!isalnum((unsigned char)buffer[0]))
        {
            break; /* empty line */
        }

        errno = 0;
        index = strtol(buffer, NULL, 0);
        if (index <= 0 || errno == ERANGE)
        {
            errval = -__LINE__;
            goto exiterror;
        }

        index--; /* switch to zero based index */
        /* integer truncated division expected below */
        lines = index / lineTextLen;
        /* whole lines skipped, plus the offset inside the target line */
        seekPos = (lines * lineBinLen) + (index - lines * lineTextLen);

        if (fseek(seqFp, seekPos, SEEK_SET) != 0)
        {
            errval = -__LINE__;
            goto exiterror;
        }
        sequence = fgetc(seqFp);
        if (sequence == EOF)
        {
            errval = -__LINE__;
            goto exiterror;
        }
        fprintf(stdout,"%li\t%c\n"
                ,index + 1 /* convert back to one based */
                ,sequence);
    }

exiterror:
    return(errval);
}
person
dennis
schedule
07.06.2014
(NR-1)*length+i
- person isosceleswheel   schedule 07.06.2014